diff --git "a/CompeteSMoE/competesmoe_versions/competesmoev32/trainer_state.json" "b/CompeteSMoE/competesmoe_versions/competesmoev32/trainer_state.json" new file mode 100644--- /dev/null +++ "b/CompeteSMoE/competesmoe_versions/competesmoev32/trainer_state.json" @@ -0,0 +1,124783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999398785546805, + "eval_steps": 500, + "global_step": 8316, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.04592296, + "auxiliary_loss_mlp": 0.0257779, + "balance_loss_clip": 2.47145319, + "balance_loss_mlp": 2.09008121, + "epoch": 0.00012024289063909097, + "flos": 24932483919360.0, + "grad_norm": 40.29560017448091, + "language_loss": 2.5798173, + "learning_rate": 0.0, + "loss": 1.90189219, + "num_input_tokens_seen": 20375, + "step": 1, + "time_per_iteration": 13.533031463623047 + }, + { + "auxiliary_loss_clip": 0.03096462, + "auxiliary_loss_mlp": 0.016449, + "balance_loss_clip": 1.65451014, + "balance_loss_mlp": 1.32103169, + "epoch": 0.00024048578127818193, + "flos": 30664624377600.0, + "grad_norm": 55.94489656313446, + "language_loss": 1.89316106, + "learning_rate": 5.021476677069823e-07, + "loss": 1.94057465, + "num_input_tokens_seen": 39035, + "step": 2, + "time_per_iteration": 2.4680161476135254 + }, + { + "auxiliary_loss_clip": 0.03069692, + "auxiliary_loss_mlp": 0.01669571, + "balance_loss_clip": 1.65272069, + "balance_loss_mlp": 1.34818316, + "epoch": 0.0003607286719172729, + "flos": 19026227969280.0, + "grad_norm": 40.107296391464416, + "language_loss": 1.61673808, + "learning_rate": 7.958852231401551e-07, + "loss": 1.66413069, + "num_input_tokens_seen": 57600, + "step": 3, + "time_per_iteration": 2.34382700920105 + }, + { + "auxiliary_loss_clip": 0.03077994, + "auxiliary_loss_mlp": 0.0173602, + "balance_loss_clip": 1.65171075, + "balance_loss_mlp": 1.41310573, + "epoch": 0.00048097156255636386, + "flos": 19316314206720.0, + "grad_norm": 36.87690991080161, + "language_loss": 1.64510322, + "learning_rate": 1.0042953354139647e-06, + "loss": 1.69324338, + "num_input_tokens_seen": 76465, + "step": 4, + "time_per_iteration": 2.4014604091644287 + }, + { + "auxiliary_loss_clip": 0.03073864, + "auxiliary_loss_mlp": 0.01667524, + "balance_loss_clip": 1.65356755, + "balance_loss_mlp": 1.35071325, + "epoch": 0.0006012144531954548, + "flos": 13991264893440.0, + "grad_norm": 55.24854794788945, + "language_loss": 1.93569183, + "learning_rate": 1.1659507774310057e-06, + "loss": 1.98310578, + "num_input_tokens_seen": 94350, + "step": 5, + "time_per_iteration": 2.6521339416503906 + }, + { + "auxiliary_loss_clip": 0.03084522, + "auxiliary_loss_mlp": 0.01682753, + "balance_loss_clip": 1.65810752, + "balance_loss_mlp": 1.36117351, + "epoch": 0.0007214573438345458, + "flos": 23148988225920.0, + "grad_norm": 45.194418366608836, + "language_loss": 1.61075056, + "learning_rate": 1.2980328908471373e-06, + "loss": 1.6584233, + "num_input_tokens_seen": 114595, + "step": 6, + "time_per_iteration": 2.793560028076172 + }, + { + "auxiliary_loss_clip": 0.03138458, + "auxiliary_loss_mlp": 0.01606507, + "balance_loss_clip": 1.79562807, + "balance_loss_mlp": 1.40661681, + "epoch": 0.0008417002344736367, + "flos": 67663246170240.0, + "grad_norm": 4.613852836749538, + "language_loss": 0.81504482, + "learning_rate": 1.4097067265369432e-06, + "loss": 0.86249447, + "num_input_tokens_seen": 179590, + "step": 7, + "time_per_iteration": 3.0909554958343506 + }, + { + "auxiliary_loss_clip": 0.0305004, + "auxiliary_loss_mlp": 0.0170907, + "balance_loss_clip": 1.64211285, + "balance_loss_mlp": 1.39588332, + "epoch": 0.0009619431251127277, + "flos": 21281381504640.0, + "grad_norm": 49.659725106120916, + "language_loss": 1.582201, + "learning_rate": 1.506443003120947e-06, + "loss": 1.62979209, + "num_input_tokens_seen": 195090, + "step": 8, + "time_per_iteration": 2.709663152694702 + }, + { + "auxiliary_loss_clip": 0.03055894, + "auxiliary_loss_mlp": 0.01697121, + "balance_loss_clip": 1.64996243, + "balance_loss_mlp": 1.37897575, + "epoch": 0.0010821860157518186, + "flos": 23331342597120.0, + "grad_norm": 17.508890825397188, + "language_loss": 1.47852767, + "learning_rate": 1.5917704462803102e-06, + "loss": 1.52605772, + "num_input_tokens_seen": 211635, + "step": 9, + "time_per_iteration": 2.653956174850464 + }, + { + "auxiliary_loss_clip": 0.03042046, + "auxiliary_loss_mlp": 0.01654968, + "balance_loss_clip": 1.64881778, + "balance_loss_mlp": 1.3379668, + "epoch": 0.0012024289063909096, + "flos": 17010166337280.0, + "grad_norm": 13.44495182275048, + "language_loss": 1.52919805, + "learning_rate": 1.6680984451379884e-06, + "loss": 1.57616818, + "num_input_tokens_seen": 224705, + "step": 10, + "time_per_iteration": 2.69167423248291 + }, + { + "auxiliary_loss_clip": 0.03049854, + "auxiliary_loss_mlp": 0.01682941, + "balance_loss_clip": 1.64465737, + "balance_loss_mlp": 1.37852824, + "epoch": 0.0013226717970300007, + "flos": 21288133261440.0, + "grad_norm": 14.412222420014784, + "language_loss": 1.3266654, + "learning_rate": 1.7371455188905097e-06, + "loss": 1.3739934, + "num_input_tokens_seen": 244635, + "step": 11, + "time_per_iteration": 2.680786371231079 + }, + { + "auxiliary_loss_clip": 0.03064559, + "auxiliary_loss_mlp": 0.0170214, + "balance_loss_clip": 1.64962065, + "balance_loss_mlp": 1.37445784, + "epoch": 0.0014429146876690916, + "flos": 27237884935680.0, + "grad_norm": 11.478639317387843, + "language_loss": 1.25377536, + "learning_rate": 1.8001805585541196e-06, + "loss": 1.30144238, + "num_input_tokens_seen": 265765, + "step": 12, + "time_per_iteration": 2.7420272827148438 + }, + { + "auxiliary_loss_clip": 0.03044011, + "auxiliary_loss_mlp": 0.01663496, + "balance_loss_clip": 1.64189076, + "balance_loss_mlp": 1.35603178, + "epoch": 0.0015631575783081825, + "flos": 19062174504960.0, + "grad_norm": 6.593714778584316, + "language_loss": 1.29324901, + "learning_rate": 1.8581671739548328e-06, + "loss": 1.34032416, + "num_input_tokens_seen": 283500, + "step": 13, + "time_per_iteration": 2.669037103652954 + }, + { + "auxiliary_loss_clip": 0.03039403, + "auxiliary_loss_mlp": 0.01618412, + "balance_loss_clip": 1.63795567, + "balance_loss_mlp": 1.30617929, + "epoch": 0.0016834004689472734, + "flos": 48139473985920.0, + "grad_norm": 6.260191131609957, + "language_loss": 1.1350081, + "learning_rate": 1.9118543942439254e-06, + "loss": 1.18158627, + "num_input_tokens_seen": 305685, + "step": 14, + "time_per_iteration": 2.8571760654449463 + }, + { + "auxiliary_loss_clip": 0.03015065, + "auxiliary_loss_mlp": 0.01677578, + "balance_loss_clip": 1.6330477, + "balance_loss_mlp": 1.36076784, + "epoch": 0.0018036433595863645, + "flos": 34970026314240.0, + "grad_norm": 5.78064015765431, + "language_loss": 1.12634933, + "learning_rate": 1.961836000571161e-06, + "loss": 1.17327571, + "num_input_tokens_seen": 327340, + "step": 15, + "time_per_iteration": 2.760812997817993 + }, + { + "auxiliary_loss_clip": 0.03027068, + "auxiliary_loss_mlp": 0.01439522, + "balance_loss_clip": 1.77060544, + "balance_loss_mlp": 1.25031328, + "epoch": 0.0019238862502254555, + "flos": 59768284440960.0, + "grad_norm": 3.756658059109441, + "language_loss": 0.64692342, + "learning_rate": 2.0085906708279293e-06, + "loss": 0.69158936, + "num_input_tokens_seen": 382710, + "step": 16, + "time_per_iteration": 5.50650429725647 + }, + { + "auxiliary_loss_clip": 0.03001511, + "auxiliary_loss_mlp": 0.01635746, + "balance_loss_clip": 1.63701475, + "balance_loss_mlp": 1.32484829, + "epoch": 0.0020441291408645466, + "flos": 20814543417600.0, + "grad_norm": 4.349171996444234, + "language_loss": 1.160671, + "learning_rate": 2.0525099325728135e-06, + "loss": 1.20704341, + "num_input_tokens_seen": 400890, + "step": 17, + "time_per_iteration": 2.6450772285461426 + }, + { + "auxiliary_loss_clip": 0.02990377, + "auxiliary_loss_mlp": 0.01400776, + "balance_loss_clip": 1.76002371, + "balance_loss_mlp": 1.21461892, + "epoch": 0.0021643720315036373, + "flos": 63857001582720.0, + "grad_norm": 3.5101266357191325, + "language_loss": 0.72124314, + "learning_rate": 2.0939181139872922e-06, + "loss": 0.76515466, + "num_input_tokens_seen": 462605, + "step": 18, + "time_per_iteration": 3.085275173187256 + }, + { + "auxiliary_loss_clip": 0.02970693, + "auxiliary_loss_mlp": 0.01583358, + "balance_loss_clip": 1.63045931, + "balance_loss_mlp": 1.28276038, + "epoch": 0.0022846149221427284, + "flos": 31284981192960.0, + "grad_norm": 4.7102787974699085, + "language_loss": 1.01558769, + "learning_rate": 2.1330868934640175e-06, + "loss": 1.06112814, + "num_input_tokens_seen": 483280, + "step": 19, + "time_per_iteration": 2.696502208709717 + }, + { + "auxiliary_loss_clip": 0.02941308, + "auxiliary_loss_mlp": 0.01372513, + "balance_loss_clip": 1.74952602, + "balance_loss_mlp": 1.19017053, + "epoch": 0.002404857812781819, + "flos": 51083648161920.0, + "grad_norm": 3.555214763055384, + "language_loss": 0.76406908, + "learning_rate": 2.170246112844971e-06, + "loss": 0.80720729, + "num_input_tokens_seen": 537620, + "step": 20, + "time_per_iteration": 2.8700852394104004 + }, + { + "auxiliary_loss_clip": 0.02914774, + "auxiliary_loss_mlp": 0.01538777, + "balance_loss_clip": 1.61850083, + "balance_loss_mlp": 1.23951364, + "epoch": 0.0025251007034209102, + "flos": 15815347309440.0, + "grad_norm": 4.735304505590101, + "language_loss": 1.01388359, + "learning_rate": 2.2055919496770983e-06, + "loss": 1.05841899, + "num_input_tokens_seen": 555760, + "step": 21, + "time_per_iteration": 2.647648811340332 + }, + { + "auxiliary_loss_clip": 0.02901449, + "auxiliary_loss_mlp": 0.01523067, + "balance_loss_clip": 1.61415291, + "balance_loss_mlp": 1.22533035, + "epoch": 0.0026453435940600014, + "flos": 37851857458560.0, + "grad_norm": 5.2540440972606985, + "language_loss": 0.8961674, + "learning_rate": 2.2392931865974923e-06, + "loss": 0.94041252, + "num_input_tokens_seen": 578450, + "step": 22, + "time_per_iteration": 2.8207290172576904 + }, + { + "auxiliary_loss_clip": 0.02862227, + "auxiliary_loss_mlp": 0.01506672, + "balance_loss_clip": 1.60537946, + "balance_loss_mlp": 1.21026993, + "epoch": 0.002765586484699092, + "flos": 21141976821120.0, + "grad_norm": 4.620206115742967, + "language_loss": 1.01830506, + "learning_rate": 2.271496085962064e-06, + "loss": 1.06199408, + "num_input_tokens_seen": 596145, + "step": 23, + "time_per_iteration": 2.69580078125 + }, + { + "auxiliary_loss_clip": 0.02834849, + "auxiliary_loss_mlp": 0.01483805, + "balance_loss_clip": 1.59239292, + "balance_loss_mlp": 1.19102716, + "epoch": 0.002885829375338183, + "flos": 20667381396480.0, + "grad_norm": 3.2009452444972335, + "language_loss": 1.02576804, + "learning_rate": 2.3023282262611022e-06, + "loss": 1.06895471, + "num_input_tokens_seen": 614920, + "step": 24, + "time_per_iteration": 2.6719298362731934 + }, + { + "auxiliary_loss_clip": 0.02845174, + "auxiliary_loss_mlp": 0.01484913, + "balance_loss_clip": 1.60035145, + "balance_loss_mlp": 1.20014632, + "epoch": 0.003006072265977274, + "flos": 34823869873920.0, + "grad_norm": 3.8772769835335987, + "language_loss": 0.92433882, + "learning_rate": 2.3319015548620114e-06, + "loss": 0.96763968, + "num_input_tokens_seen": 636060, + "step": 25, + "time_per_iteration": 2.8007843494415283 + }, + { + "auxiliary_loss_clip": 0.02802626, + "auxiliary_loss_mlp": 0.01451476, + "balance_loss_clip": 1.58807039, + "balance_loss_mlp": 1.17510104, + "epoch": 0.003126315156616365, + "flos": 24422021118720.0, + "grad_norm": 2.271328937384169, + "language_loss": 0.92861021, + "learning_rate": 2.3603148416618152e-06, + "loss": 0.97115123, + "num_input_tokens_seen": 655575, + "step": 26, + "time_per_iteration": 2.705730438232422 + }, + { + "auxiliary_loss_clip": 0.02808334, + "auxiliary_loss_mlp": 0.01435545, + "balance_loss_clip": 1.58833385, + "balance_loss_mlp": 1.16241288, + "epoch": 0.003246558047255456, + "flos": 23622326674560.0, + "grad_norm": 2.294819173954996, + "language_loss": 1.00859427, + "learning_rate": 2.3876556694204647e-06, + "loss": 1.05103302, + "num_input_tokens_seen": 675730, + "step": 27, + "time_per_iteration": 2.707465410232544 + }, + { + "auxiliary_loss_clip": 0.02768539, + "auxiliary_loss_mlp": 0.01436921, + "balance_loss_clip": 1.58165073, + "balance_loss_mlp": 1.15062785, + "epoch": 0.003366800937894547, + "flos": 17820275725440.0, + "grad_norm": 2.5769542258057805, + "language_loss": 0.90632999, + "learning_rate": 2.414002061950908e-06, + "loss": 0.94838458, + "num_input_tokens_seen": 694605, + "step": 28, + "time_per_iteration": 2.656536340713501 + }, + { + "auxiliary_loss_clip": 0.0275307, + "auxiliary_loss_mlp": 0.01410387, + "balance_loss_clip": 1.5744226, + "balance_loss_mlp": 1.14259565, + "epoch": 0.003487043828533638, + "flos": 24426115269120.0, + "grad_norm": 2.3808094420721275, + "language_loss": 0.9981423, + "learning_rate": 2.4394238264681557e-06, + "loss": 1.0397768, + "num_input_tokens_seen": 714340, + "step": 29, + "time_per_iteration": 2.68894624710083 + }, + { + "auxiliary_loss_clip": 0.02727111, + "auxiliary_loss_mlp": 0.01410366, + "balance_loss_clip": 1.56732988, + "balance_loss_mlp": 1.1368525, + "epoch": 0.003607286719172729, + "flos": 26140311002880.0, + "grad_norm": 2.0666836282045256, + "language_loss": 0.99509072, + "learning_rate": 2.4639836682781433e-06, + "loss": 1.03646541, + "num_input_tokens_seen": 734470, + "step": 30, + "time_per_iteration": 2.7190186977386475 + }, + { + "auxiliary_loss_clip": 0.02741221, + "auxiliary_loss_mlp": 0.01397796, + "balance_loss_clip": 1.5814749, + "balance_loss_mlp": 1.11760616, + "epoch": 0.00372752960981182, + "flos": 20593082113920.0, + "grad_norm": 3.2524685202632178, + "language_loss": 1.00112808, + "learning_rate": 2.487738122623307e-06, + "loss": 1.04251838, + "num_input_tokens_seen": 753380, + "step": 31, + "time_per_iteration": 2.6225290298461914 + }, + { + "auxiliary_loss_clip": 0.02697098, + "auxiliary_loss_mlp": 0.01375693, + "balance_loss_clip": 1.56319976, + "balance_loss_mlp": 1.10332382, + "epoch": 0.003847772500450911, + "flos": 22674608282880.0, + "grad_norm": 2.45100144899735, + "language_loss": 0.98840976, + "learning_rate": 2.510738338534912e-06, + "loss": 1.02913761, + "num_input_tokens_seen": 772105, + "step": 32, + "time_per_iteration": 2.649449348449707 + }, + { + "auxiliary_loss_clip": 0.02559674, + "auxiliary_loss_mlp": 0.01359604, + "balance_loss_clip": 1.52039504, + "balance_loss_mlp": 1.09200263, + "epoch": 0.003968015391090002, + "flos": 17967796882560.0, + "grad_norm": 2.721918852117027, + "language_loss": 1.02625132, + "learning_rate": 2.5330307420306648e-06, + "loss": 1.06544399, + "num_input_tokens_seen": 788955, + "step": 33, + "time_per_iteration": 2.6268293857574463 + }, + { + "auxiliary_loss_clip": 0.02520656, + "auxiliary_loss_mlp": 0.01344013, + "balance_loss_clip": 1.51562619, + "balance_loss_mlp": 1.10006297, + "epoch": 0.004088258281729093, + "flos": 27304103658240.0, + "grad_norm": 2.643964959570546, + "language_loss": 0.88083756, + "learning_rate": 2.554657600279796e-06, + "loss": 0.91948426, + "num_input_tokens_seen": 810230, + "step": 34, + "time_per_iteration": 2.7650465965270996 + }, + { + "auxiliary_loss_clip": 0.02501756, + "auxiliary_loss_mlp": 0.01324716, + "balance_loss_clip": 1.50747418, + "balance_loss_mlp": 1.07313645, + "epoch": 0.004208501172368184, + "flos": 23258587599360.0, + "grad_norm": 2.1435482384685067, + "language_loss": 1.03435397, + "learning_rate": 2.5756575039679493e-06, + "loss": 1.07261872, + "num_input_tokens_seen": 829780, + "step": 35, + "time_per_iteration": 2.741372585296631 + }, + { + "auxiliary_loss_clip": 0.02464117, + "auxiliary_loss_mlp": 0.01351179, + "balance_loss_clip": 1.49650097, + "balance_loss_mlp": 1.09998131, + "epoch": 0.0043287440630072746, + "flos": 17312104062720.0, + "grad_norm": 1.9700471122117738, + "language_loss": 0.94982445, + "learning_rate": 2.5960657816942747e-06, + "loss": 0.98797739, + "num_input_tokens_seen": 848695, + "step": 36, + "time_per_iteration": 2.711890935897827 + }, + { + "auxiliary_loss_clip": 0.02303775, + "auxiliary_loss_mlp": 0.01393126, + "balance_loss_clip": 1.57803583, + "balance_loss_mlp": 1.25198257, + "epoch": 0.004448986953646365, + "flos": 53092491160320.0, + "grad_norm": 1.398194431895382, + "language_loss": 0.60974598, + "learning_rate": 2.6159148575788668e-06, + "loss": 0.64671493, + "num_input_tokens_seen": 906730, + "step": 37, + "time_per_iteration": 3.112426996231079 + }, + { + "auxiliary_loss_clip": 0.02412854, + "auxiliary_loss_mlp": 0.013603, + "balance_loss_clip": 1.48573279, + "balance_loss_mlp": 1.1108191, + "epoch": 0.004569229844285457, + "flos": 13444165866240.0, + "grad_norm": 2.5257416776216592, + "language_loss": 0.98745996, + "learning_rate": 2.635234561171e-06, + "loss": 1.02519155, + "num_input_tokens_seen": 925125, + "step": 38, + "time_per_iteration": 2.670102596282959 + }, + { + "auxiliary_loss_clip": 0.02388864, + "auxiliary_loss_mlp": 0.01327486, + "balance_loss_clip": 1.47793186, + "balance_loss_mlp": 1.09345436, + "epoch": 0.0046894727349245475, + "flos": 16209609966720.0, + "grad_norm": 2.2874117518999193, + "language_loss": 0.94215786, + "learning_rate": 2.6540523970949877e-06, + "loss": 0.97932136, + "num_input_tokens_seen": 939970, + "step": 39, + "time_per_iteration": 2.7218737602233887 + }, + { + "auxiliary_loss_clip": 0.02357519, + "auxiliary_loss_mlp": 0.01333414, + "balance_loss_clip": 1.47663307, + "balance_loss_mlp": 1.09728432, + "epoch": 0.004809715625563638, + "flos": 23914244505600.0, + "grad_norm": 2.6644391722813188, + "language_loss": 0.92516059, + "learning_rate": 2.6723937805519533e-06, + "loss": 0.96206993, + "num_input_tokens_seen": 957470, + "step": 40, + "time_per_iteration": 2.7147934436798096 + }, + { + "auxiliary_loss_clip": 0.02345942, + "auxiliary_loss_mlp": 0.01304425, + "balance_loss_clip": 1.46480012, + "balance_loss_mlp": 1.07916677, + "epoch": 0.00492995851620273, + "flos": 20773030273920.0, + "grad_norm": 2.171423286546705, + "language_loss": 0.92990512, + "learning_rate": 2.690282243737839e-06, + "loss": 0.96640879, + "num_input_tokens_seen": 976405, + "step": 41, + "time_per_iteration": 2.6997878551483154 + }, + { + "auxiliary_loss_clip": 0.02310364, + "auxiliary_loss_mlp": 0.01327203, + "balance_loss_clip": 1.45331836, + "balance_loss_mlp": 1.09736788, + "epoch": 0.0050502014068418205, + "flos": 20338655103360.0, + "grad_norm": 2.662969012086038, + "language_loss": 0.99230605, + "learning_rate": 2.7077396173840807e-06, + "loss": 1.02868176, + "num_input_tokens_seen": 994690, + "step": 42, + "time_per_iteration": 3.478790521621704 + }, + { + "auxiliary_loss_clip": 0.02285866, + "auxiliary_loss_mlp": 0.01314746, + "balance_loss_clip": 1.44597411, + "balance_loss_mlp": 1.09521067, + "epoch": 0.005170444297480911, + "flos": 25994872834560.0, + "grad_norm": 2.6198556118564804, + "language_loss": 0.92591667, + "learning_rate": 2.7247861909342594e-06, + "loss": 0.96192282, + "num_input_tokens_seen": 1015615, + "step": 43, + "time_per_iteration": 4.0540406703948975 + }, + { + "auxiliary_loss_clip": 0.02282108, + "auxiliary_loss_mlp": 0.01309658, + "balance_loss_clip": 1.44477665, + "balance_loss_mlp": 1.09345984, + "epoch": 0.005290687188120003, + "flos": 20954055841920.0, + "grad_norm": 2.286658410201937, + "language_loss": 0.8293969, + "learning_rate": 2.7414408543044743e-06, + "loss": 0.86531454, + "num_input_tokens_seen": 1031255, + "step": 44, + "time_per_iteration": 2.6883347034454346 + }, + { + "auxiliary_loss_clip": 0.02231397, + "auxiliary_loss_mlp": 0.01332251, + "balance_loss_clip": 1.43015862, + "balance_loss_mlp": 1.11261952, + "epoch": 0.005410930078759093, + "flos": 15851401585920.0, + "grad_norm": 10.068546149385137, + "language_loss": 0.79058111, + "learning_rate": 2.7577212237113157e-06, + "loss": 0.82621753, + "num_input_tokens_seen": 1048295, + "step": 45, + "time_per_iteration": 2.682742118835449 + }, + { + "auxiliary_loss_clip": 0.02217989, + "auxiliary_loss_mlp": 0.01308043, + "balance_loss_clip": 1.42400014, + "balance_loss_mlp": 1.09499252, + "epoch": 0.005531172969398184, + "flos": 21104988791040.0, + "grad_norm": 2.1281741110699595, + "language_loss": 1.04237497, + "learning_rate": 2.7736437536690466e-06, + "loss": 1.07763529, + "num_input_tokens_seen": 1067925, + "step": 46, + "time_per_iteration": 2.7103476524353027 + }, + { + "auxiliary_loss_clip": 0.02207815, + "auxiliary_loss_mlp": 0.01276385, + "balance_loss_clip": 1.42449737, + "balance_loss_mlp": 1.06781626, + "epoch": 0.005651415860037276, + "flos": 20844887431680.0, + "grad_norm": 2.3266364602082144, + "language_loss": 1.07720137, + "learning_rate": 2.789223836941131e-06, + "loss": 1.1120435, + "num_input_tokens_seen": 1088060, + "step": 47, + "time_per_iteration": 2.6737117767333984 + }, + { + "auxiliary_loss_clip": 0.02172153, + "auxiliary_loss_mlp": 0.01285319, + "balance_loss_clip": 1.41179073, + "balance_loss_mlp": 1.08085096, + "epoch": 0.005771658750676366, + "flos": 13260195383040.0, + "grad_norm": 2.388761675367142, + "language_loss": 1.08692598, + "learning_rate": 2.8044758939680847e-06, + "loss": 1.12150085, + "num_input_tokens_seen": 1104130, + "step": 48, + "time_per_iteration": 2.612104892730713 + }, + { + "auxiliary_loss_clip": 0.02147161, + "auxiliary_loss_mlp": 0.01282264, + "balance_loss_clip": 1.41145968, + "balance_loss_mlp": 1.07655692, + "epoch": 0.005891901641315457, + "flos": 24425396997120.0, + "grad_norm": 7.205833187648408, + "language_loss": 1.02130783, + "learning_rate": 2.8194134530738863e-06, + "loss": 1.05560207, + "num_input_tokens_seen": 1122900, + "step": 49, + "time_per_iteration": 2.681382894515991 + }, + { + "auxiliary_loss_clip": 0.02139021, + "auxiliary_loss_mlp": 0.01290139, + "balance_loss_clip": 1.40579212, + "balance_loss_mlp": 1.09387314, + "epoch": 0.006012144531954548, + "flos": 23076197314560.0, + "grad_norm": 2.558727094572549, + "language_loss": 0.90087247, + "learning_rate": 2.834049222568994e-06, + "loss": 0.93516409, + "num_input_tokens_seen": 1140250, + "step": 50, + "time_per_iteration": 2.67350697517395 + }, + { + "auxiliary_loss_clip": 0.02135085, + "auxiliary_loss_mlp": 0.01255022, + "balance_loss_clip": 1.40258718, + "balance_loss_mlp": 1.06400096, + "epoch": 0.006132387422593639, + "flos": 22528775064960.0, + "grad_norm": 1.8717953273445211, + "language_loss": 0.92423445, + "learning_rate": 2.848395155712969e-06, + "loss": 0.95813555, + "num_input_tokens_seen": 1160470, + "step": 51, + "time_per_iteration": 2.6298584938049316 + }, + { + "auxiliary_loss_clip": 0.02118313, + "auxiliary_loss_mlp": 0.01293401, + "balance_loss_clip": 1.40357351, + "balance_loss_mlp": 1.09875619, + "epoch": 0.00625263031323273, + "flos": 27628340751360.0, + "grad_norm": 2.312437719329922, + "language_loss": 0.97569245, + "learning_rate": 2.8624625093687977e-06, + "loss": 1.00980961, + "num_input_tokens_seen": 1177605, + "step": 52, + "time_per_iteration": 2.705409288406372 + }, + { + "auxiliary_loss_clip": 0.02100515, + "auxiliary_loss_mlp": 0.0125987, + "balance_loss_clip": 1.39357424, + "balance_loss_mlp": 1.07705033, + "epoch": 0.006372873203871821, + "flos": 23110671392640.0, + "grad_norm": 2.413575028050906, + "language_loss": 0.88998222, + "learning_rate": 2.876261897070029e-06, + "loss": 0.92358613, + "num_input_tokens_seen": 1197735, + "step": 53, + "time_per_iteration": 2.630678415298462 + }, + { + "auxiliary_loss_clip": 0.02099512, + "auxiliary_loss_mlp": 0.01277437, + "balance_loss_clip": 1.39772594, + "balance_loss_mlp": 1.09194767, + "epoch": 0.006493116094510912, + "flos": 22856028900480.0, + "grad_norm": 2.470427108327774, + "language_loss": 0.92309314, + "learning_rate": 2.889803337127447e-06, + "loss": 0.95686257, + "num_input_tokens_seen": 1216335, + "step": 54, + "time_per_iteration": 2.6569180488586426 + }, + { + "auxiliary_loss_clip": 0.02068233, + "auxiliary_loss_mlp": 0.01296732, + "balance_loss_clip": 1.38537741, + "balance_loss_mlp": 1.10141969, + "epoch": 0.006613358985150003, + "flos": 23071708114560.0, + "grad_norm": 4.798285227810707, + "language_loss": 0.84695476, + "learning_rate": 2.903096296321516e-06, + "loss": 0.88060445, + "num_input_tokens_seen": 1234480, + "step": 55, + "time_per_iteration": 2.6526994705200195 + }, + { + "auxiliary_loss_clip": 0.02069055, + "auxiliary_loss_mlp": 0.01250531, + "balance_loss_clip": 1.38686192, + "balance_loss_mlp": 1.07524538, + "epoch": 0.006733601875789094, + "flos": 26537662229760.0, + "grad_norm": 1.9678578099806474, + "language_loss": 0.91687977, + "learning_rate": 2.9161497296578907e-06, + "loss": 0.95007569, + "num_input_tokens_seen": 1253870, + "step": 56, + "time_per_iteration": 2.70420241355896 + }, + { + "auxiliary_loss_clip": 0.02049681, + "auxiliary_loss_mlp": 0.01253205, + "balance_loss_clip": 1.38082504, + "balance_loss_mlp": 1.07563078, + "epoch": 0.006853844766428185, + "flos": 15523178083200.0, + "grad_norm": 2.169600513230549, + "language_loss": 0.85848916, + "learning_rate": 2.928972116604173e-06, + "loss": 0.891518, + "num_input_tokens_seen": 1270145, + "step": 57, + "time_per_iteration": 2.5955557823181152 + }, + { + "auxiliary_loss_clip": 0.02021701, + "auxiliary_loss_mlp": 0.01234647, + "balance_loss_clip": 1.37206626, + "balance_loss_mlp": 1.0678494, + "epoch": 0.006974087657067276, + "flos": 24243760897920.0, + "grad_norm": 5.973243230344974, + "language_loss": 1.02024364, + "learning_rate": 2.9415714941751377e-06, + "loss": 1.05280709, + "num_input_tokens_seen": 1291365, + "step": 58, + "time_per_iteration": 2.651109218597412 + }, + { + "auxiliary_loss_clip": 0.02038122, + "auxiliary_loss_mlp": 0.01256678, + "balance_loss_clip": 1.37371969, + "balance_loss_mlp": 1.08892632, + "epoch": 0.007094330547706367, + "flos": 25772513690880.0, + "grad_norm": 1.8730097132514536, + "language_loss": 0.93521208, + "learning_rate": 2.9539554871897396e-06, + "loss": 0.96816009, + "num_input_tokens_seen": 1311535, + "step": 59, + "time_per_iteration": 2.6681530475616455 + }, + { + "auxiliary_loss_clip": 0.02006071, + "auxiliary_loss_mlp": 0.01242539, + "balance_loss_clip": 1.36593437, + "balance_loss_mlp": 1.07907915, + "epoch": 0.007214573438345458, + "flos": 21319015979520.0, + "grad_norm": 2.527464112918325, + "language_loss": 0.97403991, + "learning_rate": 2.9661313359851253e-06, + "loss": 1.00652599, + "num_input_tokens_seen": 1329420, + "step": 60, + "time_per_iteration": 2.6241652965545654 + }, + { + "auxiliary_loss_clip": 0.01984811, + "auxiliary_loss_mlp": 0.01236678, + "balance_loss_clip": 1.36365652, + "balance_loss_mlp": 1.07741404, + "epoch": 0.007334816328984549, + "flos": 24937088192640.0, + "grad_norm": 6.292004761511977, + "language_loss": 0.94155359, + "learning_rate": 2.978105921839922e-06, + "loss": 0.97376847, + "num_input_tokens_seen": 1349965, + "step": 61, + "time_per_iteration": 2.7245492935180664 + }, + { + "auxiliary_loss_clip": 0.01971468, + "auxiliary_loss_mlp": 0.01249403, + "balance_loss_clip": 1.35985589, + "balance_loss_mlp": 1.08889973, + "epoch": 0.00745505921962364, + "flos": 18510586277760.0, + "grad_norm": 2.265863591350538, + "language_loss": 0.72096038, + "learning_rate": 2.9898857903302893e-06, + "loss": 0.75316906, + "num_input_tokens_seen": 1368915, + "step": 62, + "time_per_iteration": 2.6413767337799072 + }, + { + "auxiliary_loss_clip": 0.01974162, + "auxiliary_loss_mlp": 0.01254663, + "balance_loss_clip": 1.35824609, + "balance_loss_mlp": 1.08986795, + "epoch": 0.007575302110262731, + "flos": 18477656484480.0, + "grad_norm": 3.620721746446778, + "language_loss": 0.8797034, + "learning_rate": 3.001477172817253e-06, + "loss": 0.9119916, + "num_input_tokens_seen": 1386805, + "step": 63, + "time_per_iteration": 2.5936529636383057 + }, + { + "auxiliary_loss_clip": 0.01950344, + "auxiliary_loss_mlp": 0.01226837, + "balance_loss_clip": 1.35019684, + "balance_loss_mlp": 1.07834983, + "epoch": 0.007695545000901822, + "flos": 24973178382720.0, + "grad_norm": 2.721068187518123, + "language_loss": 0.9643023, + "learning_rate": 3.012886006241894e-06, + "loss": 0.99607414, + "num_input_tokens_seen": 1406190, + "step": 64, + "time_per_iteration": 2.636819839477539 + }, + { + "auxiliary_loss_clip": 0.01955315, + "auxiliary_loss_mlp": 0.01230422, + "balance_loss_clip": 1.35175109, + "balance_loss_mlp": 1.07440138, + "epoch": 0.007815787891540913, + "flos": 21324223451520.0, + "grad_norm": 2.0721214351494353, + "language_loss": 0.88308674, + "learning_rate": 3.0241179513858383e-06, + "loss": 0.91494405, + "num_input_tokens_seen": 1425500, + "step": 65, + "time_per_iteration": 2.624528646469116 + }, + { + "auxiliary_loss_clip": 0.01937021, + "auxiliary_loss_mlp": 0.01255913, + "balance_loss_clip": 1.34111071, + "balance_loss_mlp": 1.09264445, + "epoch": 0.007936030782180003, + "flos": 21575777374080.0, + "grad_norm": 2.369998740161197, + "language_loss": 0.87739629, + "learning_rate": 3.035178409737647e-06, + "loss": 0.9093256, + "num_input_tokens_seen": 1442950, + "step": 66, + "time_per_iteration": 2.6210875511169434 + }, + { + "auxiliary_loss_clip": 0.01916913, + "auxiliary_loss_mlp": 0.01219577, + "balance_loss_clip": 1.33460462, + "balance_loss_mlp": 1.08339238, + "epoch": 0.008056273672819095, + "flos": 20120785159680.0, + "grad_norm": 2.271438418590118, + "language_loss": 0.88988554, + "learning_rate": 3.046072539090907e-06, + "loss": 0.92125046, + "num_input_tokens_seen": 1460915, + "step": 67, + "time_per_iteration": 2.658783435821533 + }, + { + "auxiliary_loss_clip": 0.01911898, + "auxiliary_loss_mlp": 0.01215404, + "balance_loss_clip": 1.33377588, + "balance_loss_mlp": 1.07292461, + "epoch": 0.008176516563458186, + "flos": 18333116156160.0, + "grad_norm": 2.5127764173045657, + "language_loss": 1.04831719, + "learning_rate": 3.056805267986779e-06, + "loss": 1.07959032, + "num_input_tokens_seen": 1478385, + "step": 68, + "time_per_iteration": 2.6134274005889893 + }, + { + "auxiliary_loss_clip": 0.01894127, + "auxiliary_loss_mlp": 0.01219922, + "balance_loss_clip": 1.32747531, + "balance_loss_mlp": 1.08106661, + "epoch": 0.008296759454097276, + "flos": 21872076664320.0, + "grad_norm": 2.294801356368828, + "language_loss": 0.95265126, + "learning_rate": 3.0673813091022194e-06, + "loss": 0.98379177, + "num_input_tokens_seen": 1497605, + "step": 69, + "time_per_iteration": 3.514554500579834 + }, + { + "auxiliary_loss_clip": 0.01747915, + "auxiliary_loss_mlp": 0.0119753, + "balance_loss_clip": 1.3496182, + "balance_loss_mlp": 1.13153541, + "epoch": 0.008417002344736368, + "flos": 63408228036480.0, + "grad_norm": 1.2700818634017939, + "language_loss": 0.62012094, + "learning_rate": 3.0778051716749317e-06, + "loss": 0.64957529, + "num_input_tokens_seen": 1561150, + "step": 70, + "time_per_iteration": 4.728844165802002 + }, + { + "auxiliary_loss_clip": 0.01866523, + "auxiliary_loss_mlp": 0.01209449, + "balance_loss_clip": 1.30933714, + "balance_loss_mlp": 1.07231104, + "epoch": 0.008537245235375458, + "flos": 22966454286720.0, + "grad_norm": 2.358332988880353, + "language_loss": 0.90341187, + "learning_rate": 3.0880811730470094e-06, + "loss": 0.93417162, + "num_input_tokens_seen": 1580605, + "step": 71, + "time_per_iteration": 2.62558913230896 + }, + { + "auxiliary_loss_clip": 0.01714101, + "auxiliary_loss_mlp": 0.01158795, + "balance_loss_clip": 1.32948065, + "balance_loss_mlp": 1.09737861, + "epoch": 0.008657488126014549, + "flos": 61984046712960.0, + "grad_norm": 1.1377262086437336, + "language_loss": 0.58551824, + "learning_rate": 3.098213449401257e-06, + "loss": 0.6142472, + "num_input_tokens_seen": 1647535, + "step": 72, + "time_per_iteration": 3.1147620677948 + }, + { + "auxiliary_loss_clip": 0.01856985, + "auxiliary_loss_mlp": 0.01213376, + "balance_loss_clip": 1.30769348, + "balance_loss_mlp": 1.08215058, + "epoch": 0.00877773101665364, + "flos": 30296791152000.0, + "grad_norm": 2.159606984850392, + "language_loss": 0.98988783, + "learning_rate": 3.1082059657570015e-06, + "loss": 1.0205915, + "num_input_tokens_seen": 1666770, + "step": 73, + "time_per_iteration": 2.6980302333831787 + }, + { + "auxiliary_loss_clip": 0.01828996, + "auxiliary_loss_mlp": 0.01201452, + "balance_loss_clip": 1.29911363, + "balance_loss_mlp": 1.06755626, + "epoch": 0.00889797390729273, + "flos": 23514056104320.0, + "grad_norm": 2.902644079794967, + "language_loss": 0.96767902, + "learning_rate": 3.1180625252858496e-06, + "loss": 0.99798346, + "num_input_tokens_seen": 1685200, + "step": 74, + "time_per_iteration": 2.63293194770813 + }, + { + "auxiliary_loss_clip": 0.01812889, + "auxiliary_loss_mlp": 0.01209509, + "balance_loss_clip": 1.28972054, + "balance_loss_mlp": 1.08362412, + "epoch": 0.009018216797931822, + "flos": 23075838178560.0, + "grad_norm": 2.637849473753041, + "language_loss": 0.80078471, + "learning_rate": 3.1277867780021663e-06, + "loss": 0.83100867, + "num_input_tokens_seen": 1701835, + "step": 75, + "time_per_iteration": 2.6503360271453857 + }, + { + "auxiliary_loss_clip": 0.01791304, + "auxiliary_loss_mlp": 0.01180957, + "balance_loss_clip": 1.28232956, + "balance_loss_mlp": 1.06537127, + "epoch": 0.009138459688570914, + "flos": 15918877284480.0, + "grad_norm": 2.788882816295574, + "language_loss": 0.95595336, + "learning_rate": 3.1373822288779824e-06, + "loss": 0.98567593, + "num_input_tokens_seen": 1718415, + "step": 76, + "time_per_iteration": 2.6364893913269043 + }, + { + "auxiliary_loss_clip": 0.01790247, + "auxiliary_loss_mlp": 0.01211052, + "balance_loss_clip": 1.28385448, + "balance_loss_mlp": 1.08821845, + "epoch": 0.009258702579210003, + "flos": 27016531372800.0, + "grad_norm": 3.180093307409489, + "language_loss": 0.79535246, + "learning_rate": 3.1468522454274533e-06, + "loss": 0.82536548, + "num_input_tokens_seen": 1738770, + "step": 77, + "time_per_iteration": 2.7492098808288574 + }, + { + "auxiliary_loss_clip": 0.01780251, + "auxiliary_loss_mlp": 0.0119471, + "balance_loss_clip": 1.27814174, + "balance_loss_mlp": 1.07387948, + "epoch": 0.009378945469849095, + "flos": 26903196984960.0, + "grad_norm": 1.9469720948869846, + "language_loss": 0.91859007, + "learning_rate": 3.15620006480197e-06, + "loss": 0.9483397, + "num_input_tokens_seen": 1758040, + "step": 78, + "time_per_iteration": 2.665693998336792 + }, + { + "auxiliary_loss_clip": 0.01776904, + "auxiliary_loss_mlp": 0.01187715, + "balance_loss_clip": 1.27530289, + "balance_loss_mlp": 1.06774259, + "epoch": 0.009499188360488187, + "flos": 35694236327040.0, + "grad_norm": 4.061064857383708, + "language_loss": 0.74933708, + "learning_rate": 3.1654288004333087e-06, + "loss": 0.77898324, + "num_input_tokens_seen": 1776705, + "step": 79, + "time_per_iteration": 2.714592933654785 + }, + { + "auxiliary_loss_clip": 0.01755734, + "auxiliary_loss_mlp": 0.01180228, + "balance_loss_clip": 1.2690351, + "balance_loss_mlp": 1.07036471, + "epoch": 0.009619431251127276, + "flos": 21503201944320.0, + "grad_norm": 2.4294124591910675, + "language_loss": 0.76052099, + "learning_rate": 3.1745414482589353e-06, + "loss": 0.78988063, + "num_input_tokens_seen": 1795915, + "step": 80, + "time_per_iteration": 2.632139205932617 + }, + { + "auxiliary_loss_clip": 0.01745899, + "auxiliary_loss_mlp": 0.01173838, + "balance_loss_clip": 1.26470017, + "balance_loss_mlp": 1.06268764, + "epoch": 0.009739674141766368, + "flos": 17421056991360.0, + "grad_norm": 3.186390088460552, + "language_loss": 0.87151778, + "learning_rate": 3.1835408925606204e-06, + "loss": 0.90071511, + "num_input_tokens_seen": 1814055, + "step": 81, + "time_per_iteration": 2.6621835231781006 + }, + { + "auxiliary_loss_clip": 0.01725373, + "auxiliary_loss_mlp": 0.01186511, + "balance_loss_clip": 1.25754762, + "balance_loss_mlp": 1.07650506, + "epoch": 0.00985991703240546, + "flos": 27527109246720.0, + "grad_norm": 5.285462304250921, + "language_loss": 0.89346701, + "learning_rate": 3.1924299114448214e-06, + "loss": 0.92258584, + "num_input_tokens_seen": 1834535, + "step": 82, + "time_per_iteration": 2.671375274658203 + }, + { + "auxiliary_loss_clip": 0.01735232, + "auxiliary_loss_mlp": 0.01187334, + "balance_loss_clip": 1.2621932, + "balance_loss_mlp": 1.07756591, + "epoch": 0.00998015992304455, + "flos": 13808084509440.0, + "grad_norm": 2.534205593001497, + "language_loss": 0.83327866, + "learning_rate": 3.2012111819909055e-06, + "loss": 0.86250436, + "num_input_tokens_seen": 1851865, + "step": 83, + "time_per_iteration": 2.600539445877075 + }, + { + "auxiliary_loss_clip": 0.01723025, + "auxiliary_loss_mlp": 0.01179222, + "balance_loss_clip": 1.25518084, + "balance_loss_mlp": 1.07226741, + "epoch": 0.010100402813683641, + "flos": 20191385341440.0, + "grad_norm": 2.138824840512966, + "language_loss": 0.95058811, + "learning_rate": 3.2098872850910627e-06, + "loss": 0.97961056, + "num_input_tokens_seen": 1868540, + "step": 84, + "time_per_iteration": 2.599152088165283 + }, + { + "auxiliary_loss_clip": 0.01720412, + "auxiliary_loss_mlp": 0.01179937, + "balance_loss_clip": 1.25694168, + "balance_loss_mlp": 1.07703519, + "epoch": 0.010220645704322733, + "flos": 17201642762880.0, + "grad_norm": 2.1010529484399623, + "language_loss": 0.89264512, + "learning_rate": 3.2184607100038194e-06, + "loss": 0.92164862, + "num_input_tokens_seen": 1887180, + "step": 85, + "time_per_iteration": 2.614490270614624 + }, + { + "auxiliary_loss_clip": 0.01718053, + "auxiliary_loss_mlp": 0.01182105, + "balance_loss_clip": 1.25722146, + "balance_loss_mlp": 1.08063459, + "epoch": 0.010340888594961822, + "flos": 21470415805440.0, + "grad_norm": 5.072650567508388, + "language_loss": 0.93178141, + "learning_rate": 3.2269338586412414e-06, + "loss": 0.960783, + "num_input_tokens_seen": 1904765, + "step": 86, + "time_per_iteration": 2.6865406036376953 + }, + { + "auxiliary_loss_clip": 0.01704298, + "auxiliary_loss_mlp": 0.01173039, + "balance_loss_clip": 1.24989474, + "balance_loss_mlp": 1.0777669, + "epoch": 0.010461131485600914, + "flos": 23002831785600.0, + "grad_norm": 2.5050926268469746, + "language_loss": 0.96704745, + "learning_rate": 3.2353090496083106e-06, + "loss": 0.99582082, + "num_input_tokens_seen": 1922600, + "step": 87, + "time_per_iteration": 2.676307439804077 + }, + { + "auxiliary_loss_clip": 0.01681535, + "auxiliary_loss_mlp": 0.01172089, + "balance_loss_clip": 1.24031413, + "balance_loss_mlp": 1.082968, + "epoch": 0.010581374376240005, + "flos": 33546850571520.0, + "grad_norm": 1.9442446928374768, + "language_loss": 0.81370449, + "learning_rate": 3.2435885220114572e-06, + "loss": 0.84224069, + "num_input_tokens_seen": 1943950, + "step": 88, + "time_per_iteration": 2.7299387454986572 + }, + { + "auxiliary_loss_clip": 0.0169003, + "auxiliary_loss_mlp": 0.01156504, + "balance_loss_clip": 1.24699545, + "balance_loss_mlp": 1.06290126, + "epoch": 0.010701617266879095, + "flos": 21763087822080.0, + "grad_norm": 3.6004270120009805, + "language_loss": 0.93844104, + "learning_rate": 3.2517744390519113e-06, + "loss": 0.96690637, + "num_input_tokens_seen": 1962815, + "step": 89, + "time_per_iteration": 2.6142044067382812 + }, + { + "auxiliary_loss_clip": 0.01676169, + "auxiliary_loss_mlp": 0.01156133, + "balance_loss_clip": 1.23301625, + "balance_loss_mlp": 1.06782269, + "epoch": 0.010821860157518187, + "flos": 19060199256960.0, + "grad_norm": 2.431057334734209, + "language_loss": 0.75148028, + "learning_rate": 3.259868891418298e-06, + "loss": 0.77980328, + "num_input_tokens_seen": 1980580, + "step": 90, + "time_per_iteration": 2.592013359069824 + }, + { + "auxiliary_loss_clip": 0.01684397, + "auxiliary_loss_mlp": 0.01192342, + "balance_loss_clip": 1.24279797, + "balance_loss_mlp": 1.10045588, + "epoch": 0.010942103048157278, + "flos": 25447378757760.0, + "grad_norm": 2.02081501395768, + "language_loss": 0.8498618, + "learning_rate": 3.2678739004917757e-06, + "loss": 0.87862921, + "num_input_tokens_seen": 2000315, + "step": 91, + "time_per_iteration": 2.637687921524048 + }, + { + "auxiliary_loss_clip": 0.0166731, + "auxiliary_loss_mlp": 0.01170394, + "balance_loss_clip": 1.23643279, + "balance_loss_mlp": 1.08461094, + "epoch": 0.011062345938796368, + "flos": 27493928058240.0, + "grad_norm": 3.0767886788724335, + "language_loss": 0.92124963, + "learning_rate": 3.275791421376029e-06, + "loss": 0.94962668, + "num_input_tokens_seen": 2023760, + "step": 92, + "time_per_iteration": 2.6677346229553223 + }, + { + "auxiliary_loss_clip": 0.01656367, + "auxiliary_loss_mlp": 0.01147862, + "balance_loss_clip": 1.2288785, + "balance_loss_mlp": 1.0688982, + "epoch": 0.01118258882943546, + "flos": 16071210864000.0, + "grad_norm": 2.4559507604207944, + "language_loss": 0.96172017, + "learning_rate": 3.2836233457634622e-06, + "loss": 0.98976243, + "num_input_tokens_seen": 2041895, + "step": 93, + "time_per_iteration": 2.5751163959503174 + }, + { + "auxiliary_loss_clip": 0.01653702, + "auxiliary_loss_mlp": 0.01183245, + "balance_loss_clip": 1.22804332, + "balance_loss_mlp": 1.08911729, + "epoch": 0.011302831720074551, + "flos": 20668602458880.0, + "grad_norm": 3.2201338679645133, + "language_loss": 0.85535169, + "learning_rate": 3.2913715046481135e-06, + "loss": 0.88372111, + "num_input_tokens_seen": 2061640, + "step": 94, + "time_per_iteration": 2.6139183044433594 + }, + { + "auxiliary_loss_clip": 0.01650393, + "auxiliary_loss_mlp": 0.0116086, + "balance_loss_clip": 1.22666478, + "balance_loss_mlp": 1.08041704, + "epoch": 0.011423074610713641, + "flos": 13072238490240.0, + "grad_norm": 6.133333790693309, + "language_loss": 0.88872576, + "learning_rate": 3.299037670895023e-06, + "loss": 0.91683829, + "num_input_tokens_seen": 2078255, + "step": 95, + "time_per_iteration": 2.619093418121338 + }, + { + "auxiliary_loss_clip": 0.01652527, + "auxiliary_loss_mlp": 0.01147409, + "balance_loss_clip": 1.23226905, + "balance_loss_mlp": 1.06486893, + "epoch": 0.011543317501352733, + "flos": 30335646689280.0, + "grad_norm": 3.7049022681075114, + "language_loss": 0.80362546, + "learning_rate": 3.3066235616750667e-06, + "loss": 0.83162487, + "num_input_tokens_seen": 2099490, + "step": 96, + "time_per_iteration": 2.6571922302246094 + }, + { + "auxiliary_loss_clip": 0.01631608, + "auxiliary_loss_mlp": 0.01143059, + "balance_loss_clip": 1.21968937, + "balance_loss_mlp": 1.06500113, + "epoch": 0.011663560391991824, + "flos": 15522962601600.0, + "grad_norm": 4.102507524041367, + "language_loss": 0.9242425, + "learning_rate": 3.3141308407736276e-06, + "loss": 0.95198917, + "num_input_tokens_seen": 2116125, + "step": 97, + "time_per_iteration": 4.277625560760498 + }, + { + "auxiliary_loss_clip": 0.01637193, + "auxiliary_loss_mlp": 0.01148706, + "balance_loss_clip": 1.21698785, + "balance_loss_mlp": 1.07117188, + "epoch": 0.011783803282630914, + "flos": 19902125116800.0, + "grad_norm": 2.409503682699795, + "language_loss": 0.86788988, + "learning_rate": 3.321561120780869e-06, + "loss": 0.89574891, + "num_input_tokens_seen": 2134835, + "step": 98, + "time_per_iteration": 3.4901506900787354 + }, + { + "auxiliary_loss_clip": 0.01626835, + "auxiliary_loss_mlp": 0.01143323, + "balance_loss_clip": 1.21967435, + "balance_loss_mlp": 1.07394278, + "epoch": 0.011904046173270006, + "flos": 22340674517760.0, + "grad_norm": 12.859579686898757, + "language_loss": 1.0142858, + "learning_rate": 3.3289159651708192e-06, + "loss": 1.04198742, + "num_input_tokens_seen": 2152410, + "step": 99, + "time_per_iteration": 2.60176420211792 + }, + { + "auxiliary_loss_clip": 0.01625277, + "auxiliary_loss_mlp": 0.01142614, + "balance_loss_clip": 1.21657133, + "balance_loss_mlp": 1.06670177, + "epoch": 0.012024289063909096, + "flos": 19100060375040.0, + "grad_norm": 1.9609932207630814, + "language_loss": 0.97639257, + "learning_rate": 3.3361968902759768e-06, + "loss": 1.00407147, + "num_input_tokens_seen": 2172090, + "step": 100, + "time_per_iteration": 2.5887444019317627 + }, + { + "auxiliary_loss_clip": 0.01619465, + "auxiliary_loss_mlp": 0.01132177, + "balance_loss_clip": 1.21476483, + "balance_loss_mlp": 1.06503844, + "epoch": 0.012144531954548187, + "flos": 15012205159680.0, + "grad_norm": 2.274671944173216, + "language_loss": 0.93972164, + "learning_rate": 3.343405367163663e-06, + "loss": 0.96723807, + "num_input_tokens_seen": 2189020, + "step": 101, + "time_per_iteration": 2.5869266986846924 + }, + { + "auxiliary_loss_clip": 0.01623648, + "auxiliary_loss_mlp": 0.01137326, + "balance_loss_clip": 1.21493495, + "balance_loss_mlp": 1.06751716, + "epoch": 0.012264774845187279, + "flos": 15122020014720.0, + "grad_norm": 13.200211338057656, + "language_loss": 0.81224948, + "learning_rate": 3.350542823419951e-06, + "loss": 0.83985919, + "num_input_tokens_seen": 2205620, + "step": 102, + "time_per_iteration": 2.601289987564087 + }, + { + "auxiliary_loss_clip": 0.01617352, + "auxiliary_loss_mlp": 0.01152063, + "balance_loss_clip": 1.20911241, + "balance_loss_mlp": 1.08277845, + "epoch": 0.012385017735826368, + "flos": 13949248959360.0, + "grad_norm": 3.5181414275455443, + "language_loss": 0.87406003, + "learning_rate": 3.3576106448465615e-06, + "loss": 0.90175414, + "num_input_tokens_seen": 2219000, + "step": 103, + "time_per_iteration": 2.575157880783081 + }, + { + "auxiliary_loss_clip": 0.01606251, + "auxiliary_loss_mlp": 0.01139622, + "balance_loss_clip": 1.20703244, + "balance_loss_mlp": 1.06952691, + "epoch": 0.01250526062646546, + "flos": 23623260428160.0, + "grad_norm": 2.010437276769021, + "language_loss": 0.88113737, + "learning_rate": 3.3646101770757797e-06, + "loss": 0.9085961, + "num_input_tokens_seen": 2237790, + "step": 104, + "time_per_iteration": 2.609377861022949 + }, + { + "auxiliary_loss_clip": 0.01599122, + "auxiliary_loss_mlp": 0.0114088, + "balance_loss_clip": 1.20334709, + "balance_loss_mlp": 1.06735206, + "epoch": 0.012625503517104552, + "flos": 34640078958720.0, + "grad_norm": 1.7085932268712503, + "language_loss": 0.85761094, + "learning_rate": 3.371542727108104e-06, + "loss": 0.88501102, + "num_input_tokens_seen": 2259965, + "step": 105, + "time_per_iteration": 2.728022813796997 + }, + { + "auxiliary_loss_clip": 0.01602075, + "auxiliary_loss_mlp": 0.01180103, + "balance_loss_clip": 1.20619822, + "balance_loss_mlp": 1.1089586, + "epoch": 0.012745746407743641, + "flos": 17821891837440.0, + "grad_norm": 2.6058663153912716, + "language_loss": 0.90225899, + "learning_rate": 3.3784095647770114e-06, + "loss": 0.93008077, + "num_input_tokens_seen": 2278610, + "step": 106, + "time_per_iteration": 2.6113648414611816 + }, + { + "auxiliary_loss_clip": 0.01592173, + "auxiliary_loss_mlp": 0.01142748, + "balance_loss_clip": 1.19625306, + "balance_loss_mlp": 1.07260489, + "epoch": 0.012865989298382733, + "flos": 20595057361920.0, + "grad_norm": 2.5375172503219687, + "language_loss": 0.88876402, + "learning_rate": 3.3852119241449547e-06, + "loss": 0.91611332, + "num_input_tokens_seen": 2297730, + "step": 107, + "time_per_iteration": 2.6060562133789062 + }, + { + "auxiliary_loss_clip": 0.01587286, + "auxiliary_loss_mlp": 0.01131537, + "balance_loss_clip": 1.19514942, + "balance_loss_mlp": 1.06435037, + "epoch": 0.012986232189021825, + "flos": 23948969978880.0, + "grad_norm": 2.1263203971495717, + "language_loss": 0.96458864, + "learning_rate": 3.3919510048344295e-06, + "loss": 0.99177688, + "num_input_tokens_seen": 2315740, + "step": 108, + "time_per_iteration": 2.6117992401123047 + }, + { + "auxiliary_loss_clip": 0.01576402, + "auxiliary_loss_mlp": 0.01131786, + "balance_loss_clip": 1.19047308, + "balance_loss_mlp": 1.06994045, + "epoch": 0.013106475079660914, + "flos": 23725425686400.0, + "grad_norm": 4.195907912856149, + "language_loss": 0.86752558, + "learning_rate": 3.3986279732976907e-06, + "loss": 0.89460742, + "num_input_tokens_seen": 2334215, + "step": 109, + "time_per_iteration": 2.633835792541504 + }, + { + "auxiliary_loss_clip": 0.01570618, + "auxiliary_loss_mlp": 0.01112179, + "balance_loss_clip": 1.18739426, + "balance_loss_mlp": 1.051144, + "epoch": 0.013226717970300006, + "flos": 21102438925440.0, + "grad_norm": 2.031351529118376, + "language_loss": 0.9572528, + "learning_rate": 3.4052439640284983e-06, + "loss": 0.98408073, + "num_input_tokens_seen": 2353130, + "step": 110, + "time_per_iteration": 2.610891580581665 + }, + { + "auxiliary_loss_clip": 0.01571483, + "auxiliary_loss_mlp": 0.01129761, + "balance_loss_clip": 1.1909548, + "balance_loss_mlp": 1.06691408, + "epoch": 0.013346960860939098, + "flos": 24863902231680.0, + "grad_norm": 1.7554086602588028, + "language_loss": 0.8135798, + "learning_rate": 3.4118000807190217e-06, + "loss": 0.84059227, + "num_input_tokens_seen": 2374010, + "step": 111, + "time_per_iteration": 2.696084499359131 + }, + { + "auxiliary_loss_clip": 0.01574202, + "auxiliary_loss_mlp": 0.01128873, + "balance_loss_clip": 1.18895674, + "balance_loss_mlp": 1.06802821, + "epoch": 0.013467203751578187, + "flos": 28181940140160.0, + "grad_norm": 1.8017197257528965, + "language_loss": 0.76225603, + "learning_rate": 3.4182973973648723e-06, + "loss": 0.78928673, + "num_input_tokens_seen": 2395220, + "step": 112, + "time_per_iteration": 2.6860828399658203 + }, + { + "auxiliary_loss_clip": 0.01561408, + "auxiliary_loss_mlp": 0.01148678, + "balance_loss_clip": 1.18500376, + "balance_loss_mlp": 1.08821487, + "epoch": 0.013587446642217279, + "flos": 18916233546240.0, + "grad_norm": 2.3543122290576055, + "language_loss": 0.95194882, + "learning_rate": 3.424736959321014e-06, + "loss": 0.97904968, + "num_input_tokens_seen": 2413025, + "step": 113, + "time_per_iteration": 2.575143575668335 + }, + { + "auxiliary_loss_clip": 0.01564182, + "auxiliary_loss_mlp": 0.01142649, + "balance_loss_clip": 1.18463874, + "balance_loss_mlp": 1.08058834, + "epoch": 0.01370768953285637, + "flos": 23988615615360.0, + "grad_norm": 2.4744194211300097, + "language_loss": 0.889431, + "learning_rate": 3.431119784311155e-06, + "loss": 0.91649926, + "num_input_tokens_seen": 2432700, + "step": 114, + "time_per_iteration": 2.608177661895752 + }, + { + "auxiliary_loss_clip": 0.01549295, + "auxiliary_loss_mlp": 0.01130157, + "balance_loss_clip": 1.17954278, + "balance_loss_mlp": 1.07250714, + "epoch": 0.01382793242349546, + "flos": 39202565512320.0, + "grad_norm": 1.638271705124018, + "language_loss": 0.77672195, + "learning_rate": 3.43744686339307e-06, + "loss": 0.80351645, + "num_input_tokens_seen": 2455020, + "step": 115, + "time_per_iteration": 2.7269628047943115 + }, + { + "auxiliary_loss_clip": 0.01544967, + "auxiliary_loss_mlp": 0.010953, + "balance_loss_clip": 1.17391562, + "balance_loss_mlp": 1.0419656, + "epoch": 0.013948175314134552, + "flos": 41353506714240.0, + "grad_norm": 2.368189704300822, + "language_loss": 0.9097631, + "learning_rate": 3.44371916188212e-06, + "loss": 0.93616581, + "num_input_tokens_seen": 2475775, + "step": 116, + "time_per_iteration": 2.746272563934326 + }, + { + "auxiliary_loss_clip": 0.01539977, + "auxiliary_loss_mlp": 0.01110158, + "balance_loss_clip": 1.17387915, + "balance_loss_mlp": 1.05901718, + "epoch": 0.014068418204773643, + "flos": 22453542028800.0, + "grad_norm": 2.4300197062005022, + "language_loss": 0.86365074, + "learning_rate": 3.449937620235143e-06, + "loss": 0.8901521, + "num_input_tokens_seen": 2496370, + "step": 117, + "time_per_iteration": 2.637294292449951 + }, + { + "auxiliary_loss_clip": 0.01541673, + "auxiliary_loss_mlp": 0.01114464, + "balance_loss_clip": 1.1748333, + "balance_loss_mlp": 1.060987, + "epoch": 0.014188661095412733, + "flos": 23805147922560.0, + "grad_norm": 2.1574447369944614, + "language_loss": 0.89566547, + "learning_rate": 3.456103154896722e-06, + "loss": 0.92222685, + "num_input_tokens_seen": 2517645, + "step": 118, + "time_per_iteration": 2.635037660598755 + }, + { + "auxiliary_loss_clip": 0.01528601, + "auxiliary_loss_mlp": 0.0112393, + "balance_loss_clip": 1.16721666, + "balance_loss_mlp": 1.0726459, + "epoch": 0.014308903986051825, + "flos": 23660248458240.0, + "grad_norm": 1.87527939650359, + "language_loss": 0.92715997, + "learning_rate": 3.462216659109757e-06, + "loss": 0.95368528, + "num_input_tokens_seen": 2537825, + "step": 119, + "time_per_iteration": 2.5826480388641357 + }, + { + "auxiliary_loss_clip": 0.01548501, + "auxiliary_loss_mlp": 0.011336, + "balance_loss_clip": 1.17659259, + "balance_loss_mlp": 1.08133817, + "epoch": 0.014429146876690916, + "flos": 20667991927680.0, + "grad_norm": 2.574267891817223, + "language_loss": 0.85374397, + "learning_rate": 3.4682790036921077e-06, + "loss": 0.88056493, + "num_input_tokens_seen": 2556485, + "step": 120, + "time_per_iteration": 2.568709135055542 + }, + { + "auxiliary_loss_clip": 0.01522954, + "auxiliary_loss_mlp": 0.01109574, + "balance_loss_clip": 1.1682452, + "balance_loss_mlp": 1.06606269, + "epoch": 0.014549389767330006, + "flos": 20229199384320.0, + "grad_norm": 1.861321878702916, + "language_loss": 0.83202308, + "learning_rate": 3.4742910377810193e-06, + "loss": 0.85834837, + "num_input_tokens_seen": 2573945, + "step": 121, + "time_per_iteration": 2.618236541748047 + }, + { + "auxiliary_loss_clip": 0.01520957, + "auxiliary_loss_mlp": 0.01119775, + "balance_loss_clip": 1.16583323, + "balance_loss_mlp": 1.07340288, + "epoch": 0.014669632657969098, + "flos": 18004174381440.0, + "grad_norm": 2.4903333954254143, + "language_loss": 0.88693422, + "learning_rate": 3.4802535895469042e-06, + "loss": 0.91334158, + "num_input_tokens_seen": 2592695, + "step": 122, + "time_per_iteration": 2.7141149044036865 + }, + { + "auxiliary_loss_clip": 0.01523539, + "auxiliary_loss_mlp": 0.0111212, + "balance_loss_clip": 1.1650703, + "balance_loss_mlp": 1.06543732, + "epoch": 0.01478987554860819, + "flos": 22741796672640.0, + "grad_norm": 2.0415616141647654, + "language_loss": 0.89808935, + "learning_rate": 3.4861674668779934e-06, + "loss": 0.92444593, + "num_input_tokens_seen": 2610925, + "step": 123, + "time_per_iteration": 2.693521022796631 + }, + { + "auxiliary_loss_clip": 0.01514595, + "auxiliary_loss_mlp": 0.01106452, + "balance_loss_clip": 1.16026092, + "balance_loss_mlp": 1.05831575, + "epoch": 0.01491011843924728, + "flos": 17198590106880.0, + "grad_norm": 5.187727928313824, + "language_loss": 0.84371674, + "learning_rate": 3.492033458037272e-06, + "loss": 0.86992723, + "num_input_tokens_seen": 2629495, + "step": 124, + "time_per_iteration": 3.479440450668335 + }, + { + "auxiliary_loss_clip": 0.01510728, + "auxiliary_loss_mlp": 0.01110825, + "balance_loss_clip": 1.15768313, + "balance_loss_mlp": 1.06760001, + "epoch": 0.01503036132988637, + "flos": 17673867889920.0, + "grad_norm": 2.664610213426129, + "language_loss": 0.87365061, + "learning_rate": 3.497852332293018e-06, + "loss": 0.8998661, + "num_input_tokens_seen": 2645070, + "step": 125, + "time_per_iteration": 3.4887735843658447 + }, + { + "auxiliary_loss_clip": 0.01510354, + "auxiliary_loss_mlp": 0.0111473, + "balance_loss_clip": 1.15976214, + "balance_loss_mlp": 1.07255375, + "epoch": 0.015150604220525462, + "flos": 18878239935360.0, + "grad_norm": 2.541710048970901, + "language_loss": 0.96627098, + "learning_rate": 3.5036248405242356e-06, + "loss": 0.99252188, + "num_input_tokens_seen": 2663825, + "step": 126, + "time_per_iteration": 2.642979621887207 + }, + { + "auxiliary_loss_clip": 0.0151066, + "auxiliary_loss_mlp": 0.01113067, + "balance_loss_clip": 1.15819657, + "balance_loss_mlp": 1.0669564, + "epoch": 0.015270847111164552, + "flos": 39420184060800.0, + "grad_norm": 1.864338377495763, + "language_loss": 0.82909191, + "learning_rate": 3.509351715802146e-06, + "loss": 0.85532922, + "num_input_tokens_seen": 2684710, + "step": 127, + "time_per_iteration": 2.7415287494659424 + }, + { + "auxiliary_loss_clip": 0.01508807, + "auxiliary_loss_mlp": 0.01124026, + "balance_loss_clip": 1.1564486, + "balance_loss_mlp": 1.07691443, + "epoch": 0.015391090001803644, + "flos": 43762466286720.0, + "grad_norm": 6.730558788297959, + "language_loss": 0.78354734, + "learning_rate": 3.5150336739488763e-06, + "loss": 0.80987561, + "num_input_tokens_seen": 2706995, + "step": 128, + "time_per_iteration": 2.8090872764587402 + }, + { + "auxiliary_loss_clip": 0.01503317, + "auxiliary_loss_mlp": 0.01090642, + "balance_loss_clip": 1.15659976, + "balance_loss_mlp": 1.05115986, + "epoch": 0.015511332892442733, + "flos": 18916341287040.0, + "grad_norm": 1.9427516344510842, + "language_loss": 0.84346092, + "learning_rate": 3.5206714140744143e-06, + "loss": 0.8694005, + "num_input_tokens_seen": 2727050, + "step": 129, + "time_per_iteration": 2.6808488368988037 + }, + { + "auxiliary_loss_clip": 0.01505657, + "auxiliary_loss_mlp": 0.0111845, + "balance_loss_clip": 1.15936875, + "balance_loss_mlp": 1.07577288, + "epoch": 0.015631575783081827, + "flos": 24535283679360.0, + "grad_norm": 4.73754184153692, + "language_loss": 0.87491655, + "learning_rate": 3.5262656190928208e-06, + "loss": 0.90115762, + "num_input_tokens_seen": 2745350, + "step": 130, + "time_per_iteration": 2.5958337783813477 + }, + { + "auxiliary_loss_clip": 0.01452486, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.17128634, + "balance_loss_mlp": 1.01797032, + "epoch": 0.015751818673720917, + "flos": 62328536098560.0, + "grad_norm": 1.0704955359163306, + "language_loss": 0.7151376, + "learning_rate": 3.5318169562186737e-06, + "loss": 0.73997855, + "num_input_tokens_seen": 2814195, + "step": 131, + "time_per_iteration": 3.183711528778076 + }, + { + "auxiliary_loss_clip": 0.01492113, + "auxiliary_loss_mlp": 0.01122596, + "balance_loss_clip": 1.15217769, + "balance_loss_mlp": 1.08251762, + "epoch": 0.015872061564360006, + "flos": 23878549365120.0, + "grad_norm": 1.7558376584742281, + "language_loss": 0.82305408, + "learning_rate": 3.5373260774446292e-06, + "loss": 0.8492012, + "num_input_tokens_seen": 2834645, + "step": 132, + "time_per_iteration": 2.642026901245117 + }, + { + "auxiliary_loss_clip": 0.01489732, + "auxiliary_loss_mlp": 0.01116723, + "balance_loss_clip": 1.15059876, + "balance_loss_mlp": 1.07655001, + "epoch": 0.0159923044549991, + "flos": 23367899664000.0, + "grad_norm": 1.9892344404559494, + "language_loss": 0.90380722, + "learning_rate": 3.542793620000961e-06, + "loss": 0.9298718, + "num_input_tokens_seen": 2854120, + "step": 133, + "time_per_iteration": 2.566098928451538 + }, + { + "auxiliary_loss_clip": 0.01487185, + "auxiliary_loss_mlp": 0.01106252, + "balance_loss_clip": 1.14991784, + "balance_loss_mlp": 1.06557775, + "epoch": 0.01611254734563819, + "flos": 17858305249920.0, + "grad_norm": 2.376586049494334, + "language_loss": 0.86960846, + "learning_rate": 3.5482202067978894e-06, + "loss": 0.8955428, + "num_input_tokens_seen": 2871330, + "step": 134, + "time_per_iteration": 2.5655272006988525 + }, + { + "auxiliary_loss_clip": 0.01485696, + "auxiliary_loss_mlp": 0.01100606, + "balance_loss_clip": 1.1504935, + "balance_loss_mlp": 1.06079006, + "epoch": 0.01623279023627728, + "flos": 20954774113920.0, + "grad_norm": 2.9914096949009026, + "language_loss": 0.76311105, + "learning_rate": 3.553606446851471e-06, + "loss": 0.78897405, + "num_input_tokens_seen": 2888070, + "step": 135, + "time_per_iteration": 2.590481996536255 + }, + { + "auxiliary_loss_clip": 0.01472415, + "auxiliary_loss_mlp": 0.0109796, + "balance_loss_clip": 1.14223146, + "balance_loss_mlp": 1.05871665, + "epoch": 0.016353033126916373, + "flos": 15742412743680.0, + "grad_norm": 1.9208589061965191, + "language_loss": 0.83461642, + "learning_rate": 3.5589529356937613e-06, + "loss": 0.86032015, + "num_input_tokens_seen": 2906465, + "step": 136, + "time_per_iteration": 2.5364911556243896 + }, + { + "auxiliary_loss_clip": 0.0148191, + "auxiliary_loss_mlp": 0.01101312, + "balance_loss_clip": 1.14590597, + "balance_loss_mlp": 1.0625453, + "epoch": 0.016473276017555463, + "flos": 18807280617600.0, + "grad_norm": 2.457584216005113, + "language_loss": 0.77103424, + "learning_rate": 3.5642602557679627e-06, + "loss": 0.79686648, + "num_input_tokens_seen": 2924915, + "step": 137, + "time_per_iteration": 2.5617358684539795 + }, + { + "auxiliary_loss_clip": 0.01475814, + "auxiliary_loss_mlp": 0.01091411, + "balance_loss_clip": 1.15158582, + "balance_loss_mlp": 1.05798447, + "epoch": 0.016593518908194552, + "flos": 24352641999360.0, + "grad_norm": 3.3367477928744576, + "language_loss": 0.84288585, + "learning_rate": 3.569528976809202e-06, + "loss": 0.86855817, + "num_input_tokens_seen": 2942130, + "step": 138, + "time_per_iteration": 2.5901236534118652 + }, + { + "auxiliary_loss_clip": 0.0147646, + "auxiliary_loss_mlp": 0.01108912, + "balance_loss_clip": 1.14605927, + "balance_loss_mlp": 1.06919122, + "epoch": 0.016713761798833646, + "flos": 22346133384960.0, + "grad_norm": 1.7246101274504957, + "language_loss": 0.89972842, + "learning_rate": 3.5747596562115522e-06, + "loss": 0.92558217, + "num_input_tokens_seen": 2962745, + "step": 139, + "time_per_iteration": 2.6615800857543945 + }, + { + "auxiliary_loss_clip": 0.01480523, + "auxiliary_loss_mlp": 0.01106728, + "balance_loss_clip": 1.14758432, + "balance_loss_mlp": 1.06929624, + "epoch": 0.016834004689472735, + "flos": 17821820010240.0, + "grad_norm": 2.316607481235765, + "language_loss": 0.90950894, + "learning_rate": 3.5799528393819138e-06, + "loss": 0.93538153, + "num_input_tokens_seen": 2981825, + "step": 140, + "time_per_iteration": 2.668705701828003 + }, + { + "auxiliary_loss_clip": 0.01462849, + "auxiliary_loss_mlp": 0.01100261, + "balance_loss_clip": 1.13782465, + "balance_loss_mlp": 1.06509459, + "epoch": 0.016954247580111825, + "flos": 20519501103360.0, + "grad_norm": 2.218289125596974, + "language_loss": 0.8820675, + "learning_rate": 3.585109060081286e-06, + "loss": 0.90769857, + "num_input_tokens_seen": 3001625, + "step": 141, + "time_per_iteration": 2.556135416030884 + }, + { + "auxiliary_loss_clip": 0.01469179, + "auxiliary_loss_mlp": 0.011012, + "balance_loss_clip": 1.14193761, + "balance_loss_mlp": 1.06574762, + "epoch": 0.017074490470750915, + "flos": 22088869200000.0, + "grad_norm": 1.7956846231971892, + "language_loss": 0.78660738, + "learning_rate": 3.590228840753992e-06, + "loss": 0.81231105, + "num_input_tokens_seen": 3022055, + "step": 142, + "time_per_iteration": 2.580550193786621 + }, + { + "auxiliary_loss_clip": 0.01459938, + "auxiliary_loss_mlp": 0.01103534, + "balance_loss_clip": 1.13899851, + "balance_loss_mlp": 1.07010746, + "epoch": 0.01719473336139001, + "flos": 15997270717440.0, + "grad_norm": 2.028740299574435, + "language_loss": 0.87280518, + "learning_rate": 3.5953126928453423e-06, + "loss": 0.89843988, + "num_input_tokens_seen": 3039605, + "step": 143, + "time_per_iteration": 2.516853094100952 + }, + { + "auxiliary_loss_clip": 0.01457036, + "auxiliary_loss_mlp": 0.01084669, + "balance_loss_clip": 1.13552809, + "balance_loss_mlp": 1.05260146, + "epoch": 0.017314976252029098, + "flos": 22492038430080.0, + "grad_norm": 1.9655212222122012, + "language_loss": 0.80619258, + "learning_rate": 3.600361117108239e-06, + "loss": 0.83160961, + "num_input_tokens_seen": 3059405, + "step": 144, + "time_per_iteration": 2.574777364730835 + }, + { + "auxiliary_loss_clip": 0.01461691, + "auxiliary_loss_mlp": 0.01090405, + "balance_loss_clip": 1.13679838, + "balance_loss_mlp": 1.05633473, + "epoch": 0.017435219142668188, + "flos": 22018053536640.0, + "grad_norm": 2.0625245833236034, + "language_loss": 0.97196126, + "learning_rate": 3.6053746038991616e-06, + "loss": 0.99748224, + "num_input_tokens_seen": 3078490, + "step": 145, + "time_per_iteration": 2.5558927059173584 + }, + { + "auxiliary_loss_clip": 0.01407729, + "auxiliary_loss_mlp": 0.01010883, + "balance_loss_clip": 1.15419316, + "balance_loss_mlp": 0.99982017, + "epoch": 0.01755546203330728, + "flos": 72240526149120.0, + "grad_norm": 1.0618487226235793, + "language_loss": 0.58423042, + "learning_rate": 3.6103536334639843e-06, + "loss": 0.60841656, + "num_input_tokens_seen": 3131755, + "step": 146, + "time_per_iteration": 3.109741687774658 + }, + { + "auxiliary_loss_clip": 0.01451087, + "auxiliary_loss_mlp": 0.01087139, + "balance_loss_clip": 1.13362956, + "balance_loss_mlp": 1.05540562, + "epoch": 0.01767570492394637, + "flos": 25337061112320.0, + "grad_norm": 1.9974747287052388, + "language_loss": 0.85786569, + "learning_rate": 3.615298676214041e-06, + "loss": 0.88324791, + "num_input_tokens_seen": 3152035, + "step": 147, + "time_per_iteration": 2.617709159851074 + }, + { + "auxiliary_loss_clip": 0.01449112, + "auxiliary_loss_mlp": 0.010997, + "balance_loss_clip": 1.13172388, + "balance_loss_mlp": 1.06900418, + "epoch": 0.01779594781458546, + "flos": 20449188230400.0, + "grad_norm": 2.1250273792771837, + "language_loss": 0.88978142, + "learning_rate": 3.6202101929928317e-06, + "loss": 0.91526949, + "num_input_tokens_seen": 3170625, + "step": 148, + "time_per_iteration": 2.5402638912200928 + }, + { + "auxiliary_loss_clip": 0.01443985, + "auxiliary_loss_mlp": 0.01093728, + "balance_loss_clip": 1.12985492, + "balance_loss_mlp": 1.06410456, + "epoch": 0.017916190705224554, + "flos": 16253601148800.0, + "grad_norm": 2.0052308562285535, + "language_loss": 0.88471079, + "learning_rate": 3.6250886353337413e-06, + "loss": 0.91008788, + "num_input_tokens_seen": 3188155, + "step": 149, + "time_per_iteration": 2.5184099674224854 + }, + { + "auxiliary_loss_clip": 0.01456577, + "auxiliary_loss_mlp": 0.01095739, + "balance_loss_clip": 1.13688076, + "balance_loss_mlp": 1.06607938, + "epoch": 0.018036433595863644, + "flos": 23330588411520.0, + "grad_norm": 2.5097661483820266, + "language_loss": 0.86353678, + "learning_rate": 3.6299344457091488e-06, + "loss": 0.8890599, + "num_input_tokens_seen": 3209015, + "step": 150, + "time_per_iteration": 2.6522512435913086 + }, + { + "auxiliary_loss_clip": 0.01448062, + "auxiliary_loss_mlp": 0.01085606, + "balance_loss_clip": 1.1330452, + "balance_loss_mlp": 1.05735326, + "epoch": 0.018156676486502734, + "flos": 18588010043520.0, + "grad_norm": 2.1232483638502346, + "language_loss": 0.93856239, + "learning_rate": 3.634748057771256e-06, + "loss": 0.96389908, + "num_input_tokens_seen": 3224955, + "step": 151, + "time_per_iteration": 4.405697584152222 + }, + { + "auxiliary_loss_clip": 0.01441102, + "auxiliary_loss_mlp": 0.01090787, + "balance_loss_clip": 1.13099337, + "balance_loss_mlp": 1.06257057, + "epoch": 0.018276919377141827, + "flos": 25448707560960.0, + "grad_norm": 1.8128308807158346, + "language_loss": 0.85941714, + "learning_rate": 3.639529896584965e-06, + "loss": 0.88473606, + "num_input_tokens_seen": 3246330, + "step": 152, + "time_per_iteration": 3.4095232486724854 + }, + { + "auxiliary_loss_clip": 0.01442831, + "auxiliary_loss_mlp": 0.01080838, + "balance_loss_clip": 1.13019538, + "balance_loss_mlp": 1.05102372, + "epoch": 0.018397162267780917, + "flos": 20047311889920.0, + "grad_norm": 2.9186730443635307, + "language_loss": 0.89245641, + "learning_rate": 3.6442803788531233e-06, + "loss": 0.91769302, + "num_input_tokens_seen": 3264290, + "step": 153, + "time_per_iteration": 3.3136463165283203 + }, + { + "auxiliary_loss_clip": 0.0144434, + "auxiliary_loss_mlp": 0.01092801, + "balance_loss_clip": 1.12938178, + "balance_loss_mlp": 1.06197381, + "epoch": 0.018517405158420007, + "flos": 27565282425600.0, + "grad_norm": 2.2904171111991043, + "language_loss": 0.96016854, + "learning_rate": 3.6489999131344357e-06, + "loss": 0.98553991, + "num_input_tokens_seen": 3287065, + "step": 154, + "time_per_iteration": 2.584468126296997 + }, + { + "auxiliary_loss_clip": 0.01432768, + "auxiliary_loss_mlp": 0.0109073, + "balance_loss_clip": 1.12618279, + "balance_loss_mlp": 1.06401515, + "epoch": 0.0186376480490591, + "flos": 19354056422400.0, + "grad_norm": 1.7776320407094601, + "language_loss": 0.9060998, + "learning_rate": 3.653688900054313e-06, + "loss": 0.93133479, + "num_input_tokens_seen": 3305595, + "step": 155, + "time_per_iteration": 2.5302257537841797 + }, + { + "auxiliary_loss_clip": 0.01435272, + "auxiliary_loss_mlp": 0.01069098, + "balance_loss_clip": 1.12378955, + "balance_loss_mlp": 1.04096508, + "epoch": 0.01875789093969819, + "flos": 26687840993280.0, + "grad_norm": 2.417757028032494, + "language_loss": 0.76047444, + "learning_rate": 3.6583477325089526e-06, + "loss": 0.78551811, + "num_input_tokens_seen": 3326135, + "step": 156, + "time_per_iteration": 2.585200548171997 + }, + { + "auxiliary_loss_clip": 0.01429677, + "auxiliary_loss_mlp": 0.01079277, + "balance_loss_clip": 1.12295747, + "balance_loss_mlp": 1.0518589, + "epoch": 0.01887813383033728, + "flos": 24353001135360.0, + "grad_norm": 2.207466324006412, + "language_loss": 1.04357588, + "learning_rate": 3.6629767958628916e-06, + "loss": 1.0686655, + "num_input_tokens_seen": 3343510, + "step": 157, + "time_per_iteration": 2.5631606578826904 + }, + { + "auxiliary_loss_clip": 0.01426265, + "auxiliary_loss_mlp": 0.01080537, + "balance_loss_clip": 1.12468541, + "balance_loss_mlp": 1.05251074, + "epoch": 0.018998376720976373, + "flos": 14647532330880.0, + "grad_norm": 2.4247202971029784, + "language_loss": 0.85458946, + "learning_rate": 3.667576468140291e-06, + "loss": 0.87965751, + "num_input_tokens_seen": 3361325, + "step": 158, + "time_per_iteration": 2.5459694862365723 + }, + { + "auxiliary_loss_clip": 0.01419749, + "auxiliary_loss_mlp": 0.01064302, + "balance_loss_clip": 1.11828184, + "balance_loss_mlp": 1.03892207, + "epoch": 0.019118619611615463, + "flos": 29305261146240.0, + "grad_norm": 2.211349109085119, + "language_loss": 0.88938743, + "learning_rate": 3.672147120210184e-06, + "loss": 0.91422796, + "num_input_tokens_seen": 3377925, + "step": 159, + "time_per_iteration": 2.599454164505005 + }, + { + "auxiliary_loss_clip": 0.01426579, + "auxiliary_loss_mlp": 0.01076261, + "balance_loss_clip": 1.12569165, + "balance_loss_mlp": 1.0512867, + "epoch": 0.019238862502254553, + "flos": 20886723797760.0, + "grad_norm": 1.9756263559848335, + "language_loss": 0.86620474, + "learning_rate": 3.6766891159659177e-06, + "loss": 0.89123309, + "num_input_tokens_seen": 3396335, + "step": 160, + "time_per_iteration": 2.537170886993408 + }, + { + "auxiliary_loss_clip": 0.01426305, + "auxiliary_loss_mlp": 0.01078336, + "balance_loss_clip": 1.12688708, + "balance_loss_mlp": 1.05296862, + "epoch": 0.019359105392893646, + "flos": 21360672777600.0, + "grad_norm": 3.2657088674070787, + "language_loss": 0.8801288, + "learning_rate": 3.6812028124990075e-06, + "loss": 0.90517521, + "num_input_tokens_seen": 3413605, + "step": 161, + "time_per_iteration": 2.6275370121002197 + }, + { + "auxiliary_loss_clip": 0.01421357, + "auxiliary_loss_mlp": 0.01082785, + "balance_loss_clip": 1.1231519, + "balance_loss_mlp": 1.05837083, + "epoch": 0.019479348283532736, + "flos": 16283729681280.0, + "grad_norm": 4.047007746984585, + "language_loss": 0.81460655, + "learning_rate": 3.6856885602676016e-06, + "loss": 0.83964801, + "num_input_tokens_seen": 3429640, + "step": 162, + "time_per_iteration": 2.507634162902832 + }, + { + "auxiliary_loss_clip": 0.01420076, + "auxiliary_loss_mlp": 0.01084318, + "balance_loss_clip": 1.12288117, + "balance_loss_mlp": 1.06010652, + "epoch": 0.019599591174171826, + "flos": 22091239497600.0, + "grad_norm": 2.044509185201751, + "language_loss": 0.94333065, + "learning_rate": 3.6901467032597733e-06, + "loss": 0.96837461, + "num_input_tokens_seen": 3448125, + "step": 163, + "time_per_iteration": 2.6433823108673096 + }, + { + "auxiliary_loss_clip": 0.01422195, + "auxiliary_loss_mlp": 0.01070052, + "balance_loss_clip": 1.12144852, + "balance_loss_mlp": 1.04351616, + "epoch": 0.01971983406481092, + "flos": 19609668581760.0, + "grad_norm": 2.2499308276572494, + "language_loss": 0.87472332, + "learning_rate": 3.694577579151804e-06, + "loss": 0.89964581, + "num_input_tokens_seen": 3466535, + "step": 164, + "time_per_iteration": 2.6180763244628906 + }, + { + "auxiliary_loss_clip": 0.01421243, + "auxiliary_loss_mlp": 0.01079805, + "balance_loss_clip": 1.12326443, + "balance_loss_mlp": 1.05434155, + "epoch": 0.01984007695545001, + "flos": 19099342103040.0, + "grad_norm": 2.2160376331278835, + "language_loss": 0.73723412, + "learning_rate": 3.6989815194616703e-06, + "loss": 0.76224458, + "num_input_tokens_seen": 3483730, + "step": 165, + "time_per_iteration": 2.5633740425109863 + }, + { + "auxiliary_loss_clip": 0.01420118, + "auxiliary_loss_mlp": 0.01080578, + "balance_loss_clip": 1.11949909, + "balance_loss_mlp": 1.05358934, + "epoch": 0.0199603198460891, + "flos": 20848406964480.0, + "grad_norm": 2.8469479377242544, + "language_loss": 0.79968739, + "learning_rate": 3.703358849697888e-06, + "loss": 0.8246944, + "num_input_tokens_seen": 3503640, + "step": 166, + "time_per_iteration": 2.601308584213257 + }, + { + "auxiliary_loss_clip": 0.01417344, + "auxiliary_loss_mlp": 0.01089618, + "balance_loss_clip": 1.12307215, + "balance_loss_mlp": 1.06593156, + "epoch": 0.020080562736728192, + "flos": 21870747861120.0, + "grad_norm": 1.7661579960379223, + "language_loss": 0.82708067, + "learning_rate": 3.7077098895038803e-06, + "loss": 0.85215032, + "num_input_tokens_seen": 3523010, + "step": 167, + "time_per_iteration": 2.554431200027466 + }, + { + "auxiliary_loss_clip": 0.01416286, + "auxiliary_loss_mlp": 0.01076642, + "balance_loss_clip": 1.12099493, + "balance_loss_mlp": 1.05264544, + "epoch": 0.020200805627367282, + "flos": 21688788539520.0, + "grad_norm": 2.228780715329253, + "language_loss": 0.97167206, + "learning_rate": 3.712034952798045e-06, + "loss": 0.99660122, + "num_input_tokens_seen": 3541125, + "step": 168, + "time_per_iteration": 2.521315097808838 + }, + { + "auxiliary_loss_clip": 0.0141256, + "auxiliary_loss_mlp": 0.01084325, + "balance_loss_clip": 1.1159389, + "balance_loss_mlp": 1.05980313, + "epoch": 0.02032104851800637, + "flos": 33543043729920.0, + "grad_norm": 5.603565505184597, + "language_loss": 0.84590304, + "learning_rate": 3.7163343479096656e-06, + "loss": 0.87087184, + "num_input_tokens_seen": 3562700, + "step": 169, + "time_per_iteration": 2.6373016834259033 + }, + { + "auxiliary_loss_clip": 0.01410202, + "auxiliary_loss_mlp": 0.01076741, + "balance_loss_clip": 1.11913192, + "balance_loss_mlp": 1.05486584, + "epoch": 0.020441291408645465, + "flos": 31686965274240.0, + "grad_norm": 2.948590484015234, + "language_loss": 0.83056402, + "learning_rate": 3.720608377710802e-06, + "loss": 0.8554334, + "num_input_tokens_seen": 3582790, + "step": 170, + "time_per_iteration": 2.628444194793701 + }, + { + "auxiliary_loss_clip": 0.01403473, + "auxiliary_loss_mlp": 0.01087102, + "balance_loss_clip": 1.1130209, + "balance_loss_mlp": 1.0624969, + "epoch": 0.020561534299284555, + "flos": 20886687884160.0, + "grad_norm": 2.0495006095686255, + "language_loss": 0.86342776, + "learning_rate": 3.7248573397443277e-06, + "loss": 0.88833356, + "num_input_tokens_seen": 3601715, + "step": 171, + "time_per_iteration": 2.5745770931243896 + }, + { + "auxiliary_loss_clip": 0.01409233, + "auxiliary_loss_mlp": 0.0109003, + "balance_loss_clip": 1.11958981, + "balance_loss_mlp": 1.06468582, + "epoch": 0.020681777189923645, + "flos": 20996610480000.0, + "grad_norm": 2.0797316398658534, + "language_loss": 0.97813094, + "learning_rate": 3.729081526348224e-06, + "loss": 1.00312352, + "num_input_tokens_seen": 3620245, + "step": 172, + "time_per_iteration": 2.6195945739746094 + }, + { + "auxiliary_loss_clip": 0.01410013, + "auxiliary_loss_mlp": 0.01068789, + "balance_loss_clip": 1.11768854, + "balance_loss_mlp": 1.04669952, + "epoch": 0.020802020080562738, + "flos": 28257532312320.0, + "grad_norm": 1.936306971283965, + "language_loss": 0.84899962, + "learning_rate": 3.7332812247762777e-06, + "loss": 0.87378764, + "num_input_tokens_seen": 3641545, + "step": 173, + "time_per_iteration": 2.6253838539123535 + }, + { + "auxiliary_loss_clip": 0.01410098, + "auxiliary_loss_mlp": 0.01066071, + "balance_loss_clip": 1.12051344, + "balance_loss_mlp": 1.04311168, + "epoch": 0.020922262971201828, + "flos": 19681274344320.0, + "grad_norm": 2.4853459240993656, + "language_loss": 0.95718849, + "learning_rate": 3.737456717315293e-06, + "loss": 0.98195016, + "num_input_tokens_seen": 3660510, + "step": 174, + "time_per_iteration": 2.6722049713134766 + }, + { + "auxiliary_loss_clip": 0.01398191, + "auxiliary_loss_mlp": 0.01087759, + "balance_loss_clip": 1.11585402, + "balance_loss_mlp": 1.06469226, + "epoch": 0.021042505861840918, + "flos": 15666353694720.0, + "grad_norm": 1.7243215051599294, + "language_loss": 0.9063713, + "learning_rate": 3.7416082813989552e-06, + "loss": 0.93123078, + "num_input_tokens_seen": 3677505, + "step": 175, + "time_per_iteration": 2.5443568229675293 + }, + { + "auxiliary_loss_clip": 0.01406896, + "auxiliary_loss_mlp": 0.01079216, + "balance_loss_clip": 1.11785543, + "balance_loss_mlp": 1.05564809, + "epoch": 0.02116274875248001, + "flos": 21142012734720.0, + "grad_norm": 1.9655892053854447, + "language_loss": 0.89469743, + "learning_rate": 3.745736189718439e-06, + "loss": 0.91955858, + "num_input_tokens_seen": 3696760, + "step": 176, + "time_per_iteration": 2.535235643386841 + }, + { + "auxiliary_loss_clip": 0.01396913, + "auxiliary_loss_mlp": 0.01065821, + "balance_loss_clip": 1.11271834, + "balance_loss_mlp": 1.04332662, + "epoch": 0.0212829916431191, + "flos": 24715770543360.0, + "grad_norm": 2.7040320607491797, + "language_loss": 0.72632468, + "learning_rate": 3.749840710329894e-06, + "loss": 0.75095206, + "num_input_tokens_seen": 3717465, + "step": 177, + "time_per_iteration": 2.575251817703247 + }, + { + "auxiliary_loss_clip": 0.01408161, + "auxiliary_loss_mlp": 0.01085532, + "balance_loss_clip": 1.11644828, + "balance_loss_mlp": 1.06078386, + "epoch": 0.02140323453375819, + "flos": 16645493508480.0, + "grad_norm": 3.6053867680093465, + "language_loss": 0.98011255, + "learning_rate": 3.7539221067588938e-06, + "loss": 1.00504947, + "num_input_tokens_seen": 3731440, + "step": 178, + "time_per_iteration": 4.1849565505981445 + }, + { + "auxiliary_loss_clip": 0.01402694, + "auxiliary_loss_mlp": 0.01084266, + "balance_loss_clip": 1.11476171, + "balance_loss_mlp": 1.0603168, + "epoch": 0.021523477424397284, + "flos": 20299332689280.0, + "grad_norm": 4.752666604168373, + "language_loss": 0.93415415, + "learning_rate": 3.757980638101964e-06, + "loss": 0.95902377, + "num_input_tokens_seen": 3744935, + "step": 179, + "time_per_iteration": 3.2877705097198486 + }, + { + "auxiliary_loss_clip": 0.01405218, + "auxiliary_loss_mlp": 0.01076861, + "balance_loss_clip": 1.11657786, + "balance_loss_mlp": 1.05145788, + "epoch": 0.021643720315036374, + "flos": 26104005331200.0, + "grad_norm": 2.4511298870031917, + "language_loss": 0.8913734, + "learning_rate": 3.7620165591252806e-06, + "loss": 0.9161942, + "num_input_tokens_seen": 3763035, + "step": 180, + "time_per_iteration": 3.358435869216919 + }, + { + "auxiliary_loss_clip": 0.01394805, + "auxiliary_loss_mlp": 0.01072653, + "balance_loss_clip": 1.11491966, + "balance_loss_mlp": 1.05084956, + "epoch": 0.021763963205675464, + "flos": 24787663614720.0, + "grad_norm": 1.7665753306860168, + "language_loss": 0.9454397, + "learning_rate": 3.766030120360636e-06, + "loss": 0.97011423, + "num_input_tokens_seen": 3782665, + "step": 181, + "time_per_iteration": 2.548003911972046 + }, + { + "auxiliary_loss_clip": 0.01401304, + "auxiliary_loss_mlp": 0.01074823, + "balance_loss_clip": 1.11475885, + "balance_loss_mlp": 1.05270934, + "epoch": 0.021884206096314557, + "flos": 25813559957760.0, + "grad_norm": 2.1208633989575887, + "language_loss": 0.90331209, + "learning_rate": 3.7700215681987578e-06, + "loss": 0.92807329, + "num_input_tokens_seen": 3802435, + "step": 182, + "time_per_iteration": 2.590991258621216 + }, + { + "auxiliary_loss_clip": 0.01393931, + "auxiliary_loss_mlp": 0.01085684, + "balance_loss_clip": 1.11251068, + "balance_loss_mlp": 1.06237805, + "epoch": 0.022004448986953647, + "flos": 20082719721600.0, + "grad_norm": 1.9346909068289775, + "language_loss": 0.82292002, + "learning_rate": 3.7739911449800767e-06, + "loss": 0.84771615, + "num_input_tokens_seen": 3822490, + "step": 183, + "time_per_iteration": 2.6303699016571045 + }, + { + "auxiliary_loss_clip": 0.01394469, + "auxiliary_loss_mlp": 0.01085175, + "balance_loss_clip": 1.11118197, + "balance_loss_mlp": 1.06401491, + "epoch": 0.022124691877592736, + "flos": 20480609652480.0, + "grad_norm": 2.4506076648523143, + "language_loss": 0.80815208, + "learning_rate": 3.7779390890830114e-06, + "loss": 0.83294851, + "num_input_tokens_seen": 3841140, + "step": 184, + "time_per_iteration": 2.582005500793457 + }, + { + "auxiliary_loss_clip": 0.01394968, + "auxiliary_loss_mlp": 0.01084229, + "balance_loss_clip": 1.11145091, + "balance_loss_mlp": 1.06118631, + "epoch": 0.02224493476823183, + "flos": 23586847015680.0, + "grad_norm": 2.5213755960115054, + "language_loss": 0.85919791, + "learning_rate": 3.7818656350098723e-06, + "loss": 0.88398993, + "num_input_tokens_seen": 3862090, + "step": 185, + "time_per_iteration": 2.622337818145752 + }, + { + "auxiliary_loss_clip": 0.0138934, + "auxiliary_loss_mlp": 0.01075058, + "balance_loss_clip": 1.10778046, + "balance_loss_mlp": 1.05164576, + "epoch": 0.02236517765887092, + "flos": 16909940413440.0, + "grad_norm": 2.5423084023343017, + "language_loss": 0.7713865, + "learning_rate": 3.7857710134704447e-06, + "loss": 0.79603046, + "num_input_tokens_seen": 3881025, + "step": 186, + "time_per_iteration": 2.530702829360962 + }, + { + "auxiliary_loss_clip": 0.01388985, + "auxiliary_loss_mlp": 0.01057624, + "balance_loss_clip": 1.1122731, + "balance_loss_mlp": 1.0367744, + "epoch": 0.02248542054951001, + "flos": 43508182930560.0, + "grad_norm": 3.055214000823869, + "language_loss": 0.79375297, + "learning_rate": 3.7896554514633234e-06, + "loss": 0.81821907, + "num_input_tokens_seen": 3905310, + "step": 187, + "time_per_iteration": 2.703484296798706 + }, + { + "auxiliary_loss_clip": 0.01387329, + "auxiliary_loss_mlp": 0.01068577, + "balance_loss_clip": 1.11025524, + "balance_loss_mlp": 1.04735792, + "epoch": 0.022605663440149103, + "flos": 23367648268800.0, + "grad_norm": 3.9384464579489027, + "language_loss": 0.84562314, + "learning_rate": 3.7935191723550955e-06, + "loss": 0.87018216, + "num_input_tokens_seen": 3924265, + "step": 188, + "time_per_iteration": 2.602079391479492 + }, + { + "auxiliary_loss_clip": 0.01386209, + "auxiliary_loss_mlp": 0.01071827, + "balance_loss_clip": 1.10891056, + "balance_loss_mlp": 1.05145431, + "epoch": 0.022725906330788193, + "flos": 29019915504000.0, + "grad_norm": 2.1404268308171406, + "language_loss": 0.88815933, + "learning_rate": 3.797362395957408e-06, + "loss": 0.91273969, + "num_input_tokens_seen": 3944830, + "step": 189, + "time_per_iteration": 2.6051111221313477 + }, + { + "auxiliary_loss_clip": 0.01396753, + "auxiliary_loss_mlp": 0.01070363, + "balance_loss_clip": 1.11555684, + "balance_loss_mlp": 1.04886961, + "epoch": 0.022846149221427282, + "flos": 24496176746880.0, + "grad_norm": 2.0484068975039684, + "language_loss": 0.7848438, + "learning_rate": 3.8011853386020055e-06, + "loss": 0.809515, + "num_input_tokens_seen": 3965735, + "step": 190, + "time_per_iteration": 2.59596586227417 + }, + { + "auxiliary_loss_clip": 0.01392041, + "auxiliary_loss_mlp": 0.01084012, + "balance_loss_clip": 1.11264849, + "balance_loss_mlp": 1.06164813, + "epoch": 0.022966392112066376, + "flos": 15523537219200.0, + "grad_norm": 3.5710873371832825, + "language_loss": 0.90078062, + "learning_rate": 3.804988213213804e-06, + "loss": 0.92554116, + "num_input_tokens_seen": 3983975, + "step": 191, + "time_per_iteration": 2.5427801609039307 + }, + { + "auxiliary_loss_clip": 0.01378642, + "auxiliary_loss_mlp": 0.01020948, + "balance_loss_clip": 1.15688062, + "balance_loss_mlp": 1.00916982, + "epoch": 0.023086635002705466, + "flos": 55650408433920.0, + "grad_norm": 1.0193135268128133, + "language_loss": 0.6316992, + "learning_rate": 3.808771229382049e-06, + "loss": 0.65569508, + "num_input_tokens_seen": 4043440, + "step": 192, + "time_per_iteration": 3.0310378074645996 + }, + { + "auxiliary_loss_clip": 0.01383525, + "auxiliary_loss_mlp": 0.01079605, + "balance_loss_clip": 1.11008716, + "balance_loss_mlp": 1.0592916, + "epoch": 0.023206877893344555, + "flos": 19313441118720.0, + "grad_norm": 2.3585366127604876, + "language_loss": 0.84396625, + "learning_rate": 3.8125345934296324e-06, + "loss": 0.86859757, + "num_input_tokens_seen": 4061750, + "step": 193, + "time_per_iteration": 2.5186355113983154 + }, + { + "auxiliary_loss_clip": 0.01384921, + "auxiliary_loss_mlp": 0.01075543, + "balance_loss_clip": 1.10954964, + "balance_loss_mlp": 1.05296457, + "epoch": 0.02332712078398365, + "flos": 23072965090560.0, + "grad_norm": 2.753611041070088, + "language_loss": 0.88016462, + "learning_rate": 3.81627850848061e-06, + "loss": 0.9047693, + "num_input_tokens_seen": 4082345, + "step": 194, + "time_per_iteration": 2.5787055492401123 + }, + { + "auxiliary_loss_clip": 0.01379018, + "auxiliary_loss_mlp": 0.01066263, + "balance_loss_clip": 1.10537577, + "balance_loss_mlp": 1.04605734, + "epoch": 0.02344736367462274, + "flos": 24425971614720.0, + "grad_norm": 2.2039583775921376, + "language_loss": 0.86284089, + "learning_rate": 3.820003174525994e-06, + "loss": 0.88729376, + "num_input_tokens_seen": 4101770, + "step": 195, + "time_per_iteration": 2.538210868835449 + }, + { + "auxiliary_loss_clip": 0.01383532, + "auxiliary_loss_mlp": 0.01074209, + "balance_loss_clip": 1.11042321, + "balance_loss_mlp": 1.05376482, + "epoch": 0.02356760656526183, + "flos": 21579799697280.0, + "grad_norm": 2.3914382843859765, + "language_loss": 0.82955569, + "learning_rate": 3.823708788487851e-06, + "loss": 0.85413301, + "num_input_tokens_seen": 4118770, + "step": 196, + "time_per_iteration": 2.5229756832122803 + }, + { + "auxiliary_loss_clip": 0.01379316, + "auxiliary_loss_mlp": 0.01082421, + "balance_loss_clip": 1.10739207, + "balance_loss_mlp": 1.06291866, + "epoch": 0.02368784945590092, + "flos": 25193598192000.0, + "grad_norm": 2.6305134989326135, + "language_loss": 0.84352314, + "learning_rate": 3.827395544281781e-06, + "loss": 0.86814046, + "num_input_tokens_seen": 4141110, + "step": 197, + "time_per_iteration": 2.6391611099243164 + }, + { + "auxiliary_loss_clip": 0.01386246, + "auxiliary_loss_mlp": 0.01080686, + "balance_loss_clip": 1.1109786, + "balance_loss_mlp": 1.06018174, + "epoch": 0.02380809234654001, + "flos": 27562481164800.0, + "grad_norm": 1.999182722210569, + "language_loss": 0.78948164, + "learning_rate": 3.831063632877802e-06, + "loss": 0.81415093, + "num_input_tokens_seen": 4161430, + "step": 198, + "time_per_iteration": 2.576688289642334 + }, + { + "auxiliary_loss_clip": 0.01381736, + "auxiliary_loss_mlp": 0.01072961, + "balance_loss_clip": 1.11401629, + "balance_loss_mlp": 1.05416203, + "epoch": 0.0239283352371791, + "flos": 18259786540800.0, + "grad_norm": 2.379044868441493, + "language_loss": 0.76057124, + "learning_rate": 3.834713242359712e-06, + "loss": 0.78511822, + "num_input_tokens_seen": 4179260, + "step": 199, + "time_per_iteration": 2.490786552429199 + }, + { + "auxiliary_loss_clip": 0.01384182, + "auxiliary_loss_mlp": 0.01073777, + "balance_loss_clip": 1.10767508, + "balance_loss_mlp": 1.05190182, + "epoch": 0.02404857812781819, + "flos": 21395110942080.0, + "grad_norm": 3.457610276393604, + "language_loss": 0.87166214, + "learning_rate": 3.838344557982959e-06, + "loss": 0.89624172, + "num_input_tokens_seen": 4200640, + "step": 200, + "time_per_iteration": 2.572953224182129 + }, + { + "auxiliary_loss_clip": 0.01377653, + "auxiliary_loss_mlp": 0.01075539, + "balance_loss_clip": 1.10674405, + "balance_loss_mlp": 1.05415249, + "epoch": 0.024168821018457284, + "flos": 16654256426880.0, + "grad_norm": 3.3374394347500886, + "language_loss": 0.84899002, + "learning_rate": 3.841957762231063e-06, + "loss": 0.87352198, + "num_input_tokens_seen": 4218170, + "step": 201, + "time_per_iteration": 2.5260581970214844 + }, + { + "auxiliary_loss_clip": 0.01374378, + "auxiliary_loss_mlp": 0.01063737, + "balance_loss_clip": 1.1042887, + "balance_loss_mlp": 1.0435431, + "epoch": 0.024289063909096374, + "flos": 22820872464000.0, + "grad_norm": 2.4385184604658967, + "language_loss": 0.87626147, + "learning_rate": 3.8455530348706454e-06, + "loss": 0.90064269, + "num_input_tokens_seen": 4237770, + "step": 202, + "time_per_iteration": 2.571443557739258 + }, + { + "auxiliary_loss_clip": 0.01375074, + "auxiliary_loss_mlp": 0.01073974, + "balance_loss_clip": 1.10634232, + "balance_loss_mlp": 1.05511534, + "epoch": 0.024409306799735464, + "flos": 17748598135680.0, + "grad_norm": 2.153162568101511, + "language_loss": 0.77302998, + "learning_rate": 3.849130553005099e-06, + "loss": 0.79752046, + "num_input_tokens_seen": 4255985, + "step": 203, + "time_per_iteration": 2.48757266998291 + }, + { + "auxiliary_loss_clip": 0.01375607, + "auxiliary_loss_mlp": 0.01067165, + "balance_loss_clip": 1.10425639, + "balance_loss_mlp": 1.04836547, + "epoch": 0.024529549690374557, + "flos": 21616213109760.0, + "grad_norm": 3.480862266978662, + "language_loss": 0.8341502, + "learning_rate": 3.852690491126933e-06, + "loss": 0.85857791, + "num_input_tokens_seen": 4276035, + "step": 204, + "time_per_iteration": 3.2992236614227295 + }, + { + "auxiliary_loss_clip": 0.01370282, + "auxiliary_loss_mlp": 0.01062416, + "balance_loss_clip": 1.10067284, + "balance_loss_mlp": 1.04217386, + "epoch": 0.024649792581013647, + "flos": 25551662918400.0, + "grad_norm": 3.0605471564654763, + "language_loss": 0.91415983, + "learning_rate": 3.856233021168845e-06, + "loss": 0.93848681, + "num_input_tokens_seen": 4295730, + "step": 205, + "time_per_iteration": 3.370774984359741 + }, + { + "auxiliary_loss_clip": 0.0136434, + "auxiliary_loss_mlp": 0.01054383, + "balance_loss_clip": 1.10094357, + "balance_loss_mlp": 1.03669214, + "epoch": 0.024770035471652737, + "flos": 34495574544000.0, + "grad_norm": 2.242911687815196, + "language_loss": 0.91352785, + "learning_rate": 3.859758312553544e-06, + "loss": 0.93771505, + "num_input_tokens_seen": 4317950, + "step": 206, + "time_per_iteration": 4.205277681350708 + }, + { + "auxiliary_loss_clip": 0.01373758, + "auxiliary_loss_mlp": 0.01069276, + "balance_loss_clip": 1.1070987, + "balance_loss_mlp": 1.05090547, + "epoch": 0.02489027836229183, + "flos": 21505428587520.0, + "grad_norm": 2.119683256324153, + "language_loss": 0.91625947, + "learning_rate": 3.8632665322423735e-06, + "loss": 0.9406898, + "num_input_tokens_seen": 4337605, + "step": 207, + "time_per_iteration": 2.5087318420410156 + }, + { + "auxiliary_loss_clip": 0.01371615, + "auxiliary_loss_mlp": 0.01065658, + "balance_loss_clip": 1.10397506, + "balance_loss_mlp": 1.0465014, + "epoch": 0.02501052125293092, + "flos": 23219013790080.0, + "grad_norm": 1.841662127677893, + "language_loss": 0.86065757, + "learning_rate": 3.866757844782762e-06, + "loss": 0.88503033, + "num_input_tokens_seen": 4358110, + "step": 208, + "time_per_iteration": 2.6086056232452393 + }, + { + "auxiliary_loss_clip": 0.01371228, + "auxiliary_loss_mlp": 0.01063733, + "balance_loss_clip": 1.10509634, + "balance_loss_mlp": 1.04489827, + "epoch": 0.02513076414357001, + "flos": 26388920010240.0, + "grad_norm": 2.8526319740064374, + "language_loss": 0.91058338, + "learning_rate": 3.870232412354527e-06, + "loss": 0.93493301, + "num_input_tokens_seen": 4374955, + "step": 209, + "time_per_iteration": 2.523590326309204 + }, + { + "auxiliary_loss_clip": 0.01367578, + "auxiliary_loss_mlp": 0.01064445, + "balance_loss_clip": 1.10237503, + "balance_loss_mlp": 1.04549098, + "epoch": 0.025251007034209103, + "flos": 13590430047360.0, + "grad_norm": 2.0385818608497526, + "language_loss": 0.92587775, + "learning_rate": 3.873690394815086e-06, + "loss": 0.95019805, + "num_input_tokens_seen": 4391535, + "step": 210, + "time_per_iteration": 2.495131015777588 + }, + { + "auxiliary_loss_clip": 0.01365097, + "auxiliary_loss_mlp": 0.01058858, + "balance_loss_clip": 1.09899962, + "balance_loss_mlp": 1.04014182, + "epoch": 0.025371249924848193, + "flos": 15049229103360.0, + "grad_norm": 2.5820015441999047, + "language_loss": 0.91305387, + "learning_rate": 3.877131949743587e-06, + "loss": 0.93729341, + "num_input_tokens_seen": 4408400, + "step": 211, + "time_per_iteration": 2.478599786758423 + }, + { + "auxiliary_loss_clip": 0.01367094, + "auxiliary_loss_mlp": 0.01078591, + "balance_loss_clip": 1.10253382, + "balance_loss_mlp": 1.05948234, + "epoch": 0.025491492815487283, + "flos": 25553853648000.0, + "grad_norm": 2.2464687423561025, + "language_loss": 0.77946156, + "learning_rate": 3.880557232483993e-06, + "loss": 0.80391848, + "num_input_tokens_seen": 4427840, + "step": 212, + "time_per_iteration": 2.5540060997009277 + }, + { + "auxiliary_loss_clip": 0.01366147, + "auxiliary_loss_mlp": 0.01061694, + "balance_loss_clip": 1.09939647, + "balance_loss_mlp": 1.04216766, + "epoch": 0.025611735706126376, + "flos": 20630752502400.0, + "grad_norm": 2.013256768384452, + "language_loss": 0.86888659, + "learning_rate": 3.883966396187164e-06, + "loss": 0.89316499, + "num_input_tokens_seen": 4447110, + "step": 213, + "time_per_iteration": 2.5085840225219727 + }, + { + "auxiliary_loss_clip": 0.0136812, + "auxiliary_loss_mlp": 0.01060864, + "balance_loss_clip": 1.10369563, + "balance_loss_mlp": 1.04280365, + "epoch": 0.025731978596765466, + "flos": 19062282245760.0, + "grad_norm": 2.048296030302431, + "language_loss": 0.90014124, + "learning_rate": 3.887359591851937e-06, + "loss": 0.92443109, + "num_input_tokens_seen": 4464715, + "step": 214, + "time_per_iteration": 2.543308734893799 + }, + { + "auxiliary_loss_clip": 0.01363345, + "auxiliary_loss_mlp": 0.01058674, + "balance_loss_clip": 1.10138774, + "balance_loss_mlp": 1.03925502, + "epoch": 0.025852221487404556, + "flos": 22163814927360.0, + "grad_norm": 2.2926812510931076, + "language_loss": 0.92373013, + "learning_rate": 3.890736968365265e-06, + "loss": 0.94795024, + "num_input_tokens_seen": 4485030, + "step": 215, + "time_per_iteration": 2.5321481227874756 + }, + { + "auxiliary_loss_clip": 0.01364322, + "auxiliary_loss_mlp": 0.0106298, + "balance_loss_clip": 1.10002589, + "balance_loss_mlp": 1.04281032, + "epoch": 0.02597246437804365, + "flos": 26541971861760.0, + "grad_norm": 1.9901051216940264, + "language_loss": 0.85252273, + "learning_rate": 3.894098672541412e-06, + "loss": 0.87679577, + "num_input_tokens_seen": 4505935, + "step": 216, + "time_per_iteration": 2.6220383644104004 + }, + { + "auxiliary_loss_clip": 0.01364433, + "auxiliary_loss_mlp": 0.01066895, + "balance_loss_clip": 1.10046506, + "balance_loss_mlp": 1.04668856, + "epoch": 0.02609270726868274, + "flos": 32671671696000.0, + "grad_norm": 1.7493344947840754, + "language_loss": 0.75335574, + "learning_rate": 3.89744484916025e-06, + "loss": 0.77766901, + "num_input_tokens_seen": 4527045, + "step": 217, + "time_per_iteration": 2.65970516204834 + }, + { + "auxiliary_loss_clip": 0.01366411, + "auxiliary_loss_mlp": 0.01068807, + "balance_loss_clip": 1.10224271, + "balance_loss_mlp": 1.04861248, + "epoch": 0.02621295015932183, + "flos": 26243553669120.0, + "grad_norm": 2.3008581604823672, + "language_loss": 0.87159967, + "learning_rate": 3.900775641004673e-06, + "loss": 0.89595187, + "num_input_tokens_seen": 4546360, + "step": 218, + "time_per_iteration": 2.546806573867798 + }, + { + "auxiliary_loss_clip": 0.01371726, + "auxiliary_loss_mlp": 0.01074983, + "balance_loss_clip": 1.10462165, + "balance_loss_mlp": 1.05239296, + "epoch": 0.026333193049960922, + "flos": 42921402353280.0, + "grad_norm": 3.320789338224337, + "language_loss": 0.73600757, + "learning_rate": 3.904091188897156e-06, + "loss": 0.76047468, + "num_input_tokens_seen": 4565495, + "step": 219, + "time_per_iteration": 2.669745922088623 + }, + { + "auxiliary_loss_clip": 0.01362539, + "auxiliary_loss_mlp": 0.01073098, + "balance_loss_clip": 1.10027313, + "balance_loss_mlp": 1.05253482, + "epoch": 0.026453435940600012, + "flos": 17963846386560.0, + "grad_norm": 2.020720471020208, + "language_loss": 0.81937492, + "learning_rate": 3.90739163173548e-06, + "loss": 0.84373134, + "num_input_tokens_seen": 4583330, + "step": 220, + "time_per_iteration": 2.5275635719299316 + }, + { + "auxiliary_loss_clip": 0.01360372, + "auxiliary_loss_mlp": 0.01069607, + "balance_loss_clip": 1.09959698, + "balance_loss_mlp": 1.05024719, + "epoch": 0.026573678831239102, + "flos": 18984319776000.0, + "grad_norm": 2.9456030849177925, + "language_loss": 0.88444632, + "learning_rate": 3.910677106527646e-06, + "loss": 0.90874612, + "num_input_tokens_seen": 4600520, + "step": 221, + "time_per_iteration": 2.4854769706726074 + }, + { + "auxiliary_loss_clip": 0.01358389, + "auxiliary_loss_mlp": 0.01069227, + "balance_loss_clip": 1.09946918, + "balance_loss_mlp": 1.05108368, + "epoch": 0.026693921721878195, + "flos": 29241448634880.0, + "grad_norm": 2.5311777765044643, + "language_loss": 0.84321344, + "learning_rate": 3.913947748426004e-06, + "loss": 0.86748958, + "num_input_tokens_seen": 4617340, + "step": 222, + "time_per_iteration": 2.6192891597747803 + }, + { + "auxiliary_loss_clip": 0.01364779, + "auxiliary_loss_mlp": 0.01070756, + "balance_loss_clip": 1.10314584, + "balance_loss_mlp": 1.05221844, + "epoch": 0.026814164612517285, + "flos": 14128083797760.0, + "grad_norm": 2.854249974962943, + "language_loss": 0.76695639, + "learning_rate": 3.9172036907606136e-06, + "loss": 0.79131174, + "num_input_tokens_seen": 4630820, + "step": 223, + "time_per_iteration": 2.4835925102233887 + }, + { + "auxiliary_loss_clip": 0.01362118, + "auxiliary_loss_mlp": 0.01064354, + "balance_loss_clip": 1.09909868, + "balance_loss_mlp": 1.04522085, + "epoch": 0.026934407503156375, + "flos": 23511973115520.0, + "grad_norm": 1.8045871885028086, + "language_loss": 0.94935501, + "learning_rate": 3.920445065071855e-06, + "loss": 0.97361982, + "num_input_tokens_seen": 4651985, + "step": 224, + "time_per_iteration": 2.5434398651123047 + }, + { + "auxiliary_loss_clip": 0.01358694, + "auxiliary_loss_mlp": 0.0107522, + "balance_loss_clip": 1.09892631, + "balance_loss_mlp": 1.05577648, + "epoch": 0.027054650393795468, + "flos": 28950356816640.0, + "grad_norm": 2.3382265797310966, + "language_loss": 0.7993409, + "learning_rate": 3.923672001142322e-06, + "loss": 0.82368004, + "num_input_tokens_seen": 4672295, + "step": 225, + "time_per_iteration": 2.5866034030914307 + }, + { + "auxiliary_loss_clip": 0.01355271, + "auxiliary_loss_mlp": 0.01076068, + "balance_loss_clip": 1.09733748, + "balance_loss_mlp": 1.05650616, + "epoch": 0.027174893284434558, + "flos": 31431568596480.0, + "grad_norm": 4.077587836933165, + "language_loss": 0.84481573, + "learning_rate": 3.926884627027996e-06, + "loss": 0.86912912, + "num_input_tokens_seen": 4696065, + "step": 226, + "time_per_iteration": 2.692692518234253 + }, + { + "auxiliary_loss_clip": 0.01357076, + "auxiliary_loss_mlp": 0.01070759, + "balance_loss_clip": 1.09662664, + "balance_loss_mlp": 1.05248427, + "epoch": 0.027295136175073648, + "flos": 22054466949120.0, + "grad_norm": 2.107326939323988, + "language_loss": 0.77363896, + "learning_rate": 3.930083069088744e-06, + "loss": 0.79791731, + "num_input_tokens_seen": 4716065, + "step": 227, + "time_per_iteration": 2.503667116165161 + }, + { + "auxiliary_loss_clip": 0.01322995, + "auxiliary_loss_mlp": 0.01016696, + "balance_loss_clip": 1.1227802, + "balance_loss_mlp": 1.00658703, + "epoch": 0.02741537906571274, + "flos": 60800752972800.0, + "grad_norm": 0.9749441330164644, + "language_loss": 0.59297657, + "learning_rate": 3.933267452018137e-06, + "loss": 0.61637342, + "num_input_tokens_seen": 4775860, + "step": 228, + "time_per_iteration": 3.094857692718506 + }, + { + "auxiliary_loss_clip": 0.01355364, + "auxiliary_loss_mlp": 0.01062031, + "balance_loss_clip": 1.099015, + "balance_loss_mlp": 1.04311216, + "epoch": 0.02753562195635183, + "flos": 24606278910720.0, + "grad_norm": 2.208420289896797, + "language_loss": 0.8430475, + "learning_rate": 3.936437898872622e-06, + "loss": 0.86722136, + "num_input_tokens_seen": 4795835, + "step": 229, + "time_per_iteration": 2.54166841506958 + }, + { + "auxiliary_loss_clip": 0.01357199, + "auxiliary_loss_mlp": 0.0105513, + "balance_loss_clip": 1.09940708, + "balance_loss_mlp": 1.03717709, + "epoch": 0.02765586484699092, + "flos": 34094236907520.0, + "grad_norm": 3.2385977808700495, + "language_loss": 0.79844141, + "learning_rate": 3.9395945311000525e-06, + "loss": 0.8225646, + "num_input_tokens_seen": 4817460, + "step": 230, + "time_per_iteration": 2.6339735984802246 + }, + { + "auxiliary_loss_clip": 0.0135729, + "auxiliary_loss_mlp": 0.01069433, + "balance_loss_clip": 1.09914613, + "balance_loss_mlp": 1.05052686, + "epoch": 0.027776107737630014, + "flos": 14829922615680.0, + "grad_norm": 3.8376158589856786, + "language_loss": 0.90692508, + "learning_rate": 3.942737468567608e-06, + "loss": 0.93119228, + "num_input_tokens_seen": 4835475, + "step": 231, + "time_per_iteration": 2.497300863265991 + }, + { + "auxiliary_loss_clip": 0.01355252, + "auxiliary_loss_mlp": 0.01069152, + "balance_loss_clip": 1.09883857, + "balance_loss_mlp": 1.05079365, + "epoch": 0.027896350628269104, + "flos": 47920347066240.0, + "grad_norm": 2.0804334918992344, + "language_loss": 0.8606438, + "learning_rate": 3.9458668295891026e-06, + "loss": 0.88488781, + "num_input_tokens_seen": 4857760, + "step": 232, + "time_per_iteration": 3.556023120880127 + }, + { + "auxiliary_loss_clip": 0.01351136, + "auxiliary_loss_mlp": 0.0106241, + "balance_loss_clip": 1.09420633, + "balance_loss_mlp": 1.04269302, + "epoch": 0.028016593518908194, + "flos": 21684550734720.0, + "grad_norm": 2.3319487340446874, + "language_loss": 0.86540693, + "learning_rate": 3.948982730951712e-06, + "loss": 0.88954234, + "num_input_tokens_seen": 4875855, + "step": 233, + "time_per_iteration": 3.401474952697754 + }, + { + "auxiliary_loss_clip": 0.01354923, + "auxiliary_loss_mlp": 0.0106241, + "balance_loss_clip": 1.09745383, + "balance_loss_mlp": 1.04310989, + "epoch": 0.028136836409547287, + "flos": 18439483305600.0, + "grad_norm": 2.467211104804349, + "language_loss": 0.82109368, + "learning_rate": 3.9520852879421254e-06, + "loss": 0.84526706, + "num_input_tokens_seen": 4893200, + "step": 234, + "time_per_iteration": 3.2872390747070312 + }, + { + "auxiliary_loss_clip": 0.01349852, + "auxiliary_loss_mlp": 0.01065635, + "balance_loss_clip": 1.09696901, + "balance_loss_mlp": 1.04802775, + "epoch": 0.028257079300186377, + "flos": 31576934937600.0, + "grad_norm": 2.247225688848913, + "language_loss": 0.81712079, + "learning_rate": 3.955174614372137e-06, + "loss": 0.84127569, + "num_input_tokens_seen": 4912965, + "step": 235, + "time_per_iteration": 2.5721051692962646 + }, + { + "auxiliary_loss_clip": 0.01352424, + "auxiliary_loss_mlp": 0.01068071, + "balance_loss_clip": 1.09749591, + "balance_loss_mlp": 1.04923606, + "epoch": 0.028377322190825467, + "flos": 23513337832320.0, + "grad_norm": 2.3448512452873995, + "language_loss": 0.84337473, + "learning_rate": 3.9582508226037045e-06, + "loss": 0.8675797, + "num_input_tokens_seen": 4933105, + "step": 236, + "time_per_iteration": 2.538154363632202 + }, + { + "auxiliary_loss_clip": 0.01359055, + "auxiliary_loss_mlp": 0.01070912, + "balance_loss_clip": 1.09861803, + "balance_loss_mlp": 1.05113506, + "epoch": 0.02849756508146456, + "flos": 20479604071680.0, + "grad_norm": 2.440995852846626, + "language_loss": 0.94367313, + "learning_rate": 3.9613140235734636e-06, + "loss": 0.96797276, + "num_input_tokens_seen": 4950085, + "step": 237, + "time_per_iteration": 2.5157129764556885 + }, + { + "auxiliary_loss_clip": 0.01350154, + "auxiliary_loss_mlp": 0.01063875, + "balance_loss_clip": 1.09532285, + "balance_loss_mlp": 1.04457521, + "epoch": 0.02861780797210365, + "flos": 14283362292480.0, + "grad_norm": 1.992663850482671, + "language_loss": 0.80907106, + "learning_rate": 3.96436432681674e-06, + "loss": 0.83321142, + "num_input_tokens_seen": 4968075, + "step": 238, + "time_per_iteration": 2.4828989505767822 + }, + { + "auxiliary_loss_clip": 0.01349834, + "auxiliary_loss_mlp": 0.01069475, + "balance_loss_clip": 1.09542072, + "balance_loss_mlp": 1.05052102, + "epoch": 0.02873805086274274, + "flos": 25808532053760.0, + "grad_norm": 1.9908416034147012, + "language_loss": 0.88966727, + "learning_rate": 3.967401840491044e-06, + "loss": 0.91386038, + "num_input_tokens_seen": 4987355, + "step": 239, + "time_per_iteration": 2.5395402908325195 + }, + { + "auxiliary_loss_clip": 0.01347916, + "auxiliary_loss_mlp": 0.01063631, + "balance_loss_clip": 1.09742951, + "balance_loss_mlp": 1.04706097, + "epoch": 0.028858293753381833, + "flos": 17304238984320.0, + "grad_norm": 2.672957067561469, + "language_loss": 0.87890118, + "learning_rate": 3.97042667139909e-06, + "loss": 0.90301669, + "num_input_tokens_seen": 5004680, + "step": 240, + "time_per_iteration": 2.4862513542175293 + }, + { + "auxiliary_loss_clip": 0.01349318, + "auxiliary_loss_mlp": 0.010588, + "balance_loss_clip": 1.09697223, + "balance_loss_mlp": 1.0407877, + "epoch": 0.028978536644020923, + "flos": 23038347358080.0, + "grad_norm": 2.174193489347373, + "language_loss": 0.87252986, + "learning_rate": 3.973438925011327e-06, + "loss": 0.89661103, + "num_input_tokens_seen": 5022965, + "step": 241, + "time_per_iteration": 2.523435115814209 + }, + { + "auxiliary_loss_clip": 0.01348963, + "auxiliary_loss_mlp": 0.01051689, + "balance_loss_clip": 1.09396458, + "balance_loss_mlp": 1.03308034, + "epoch": 0.029098779534660012, + "flos": 28329712692480.0, + "grad_norm": 9.429468356383522, + "language_loss": 0.91451728, + "learning_rate": 3.976438705488002e-06, + "loss": 0.93852377, + "num_input_tokens_seen": 5042625, + "step": 242, + "time_per_iteration": 2.5604324340820312 + }, + { + "auxiliary_loss_clip": 0.01346862, + "auxiliary_loss_mlp": 0.01061335, + "balance_loss_clip": 1.09626973, + "balance_loss_mlp": 1.04419231, + "epoch": 0.029219022425299106, + "flos": 13881665520000.0, + "grad_norm": 8.943295230343054, + "language_loss": 0.93283665, + "learning_rate": 3.9794261157007744e-06, + "loss": 0.9569186, + "num_input_tokens_seen": 5060380, + "step": 243, + "time_per_iteration": 2.5408549308776855 + }, + { + "auxiliary_loss_clip": 0.01352182, + "auxiliary_loss_mlp": 0.01058749, + "balance_loss_clip": 1.0975039, + "balance_loss_mlp": 1.03918707, + "epoch": 0.029339265315938196, + "flos": 19422501788160.0, + "grad_norm": 2.3217988013801363, + "language_loss": 0.85028291, + "learning_rate": 3.982401257253887e-06, + "loss": 0.87439227, + "num_input_tokens_seen": 5078720, + "step": 244, + "time_per_iteration": 2.4937031269073486 + }, + { + "auxiliary_loss_clip": 0.01348208, + "auxiliary_loss_mlp": 0.0105766, + "balance_loss_clip": 1.0945996, + "balance_loss_mlp": 1.04029155, + "epoch": 0.029459508206577285, + "flos": 15669550005120.0, + "grad_norm": 2.3041590113838204, + "language_loss": 0.89937472, + "learning_rate": 3.985364230504893e-06, + "loss": 0.92343342, + "num_input_tokens_seen": 5096605, + "step": 245, + "time_per_iteration": 2.4814274311065674 + }, + { + "auxiliary_loss_clip": 0.01354748, + "auxiliary_loss_mlp": 0.01061326, + "balance_loss_clip": 1.10030985, + "balance_loss_mlp": 1.04437447, + "epoch": 0.02957975109721638, + "flos": 28220975245440.0, + "grad_norm": 2.013585364659552, + "language_loss": 0.84621513, + "learning_rate": 3.988315134584976e-06, + "loss": 0.87037593, + "num_input_tokens_seen": 5116285, + "step": 246, + "time_per_iteration": 2.5597591400146484 + }, + { + "auxiliary_loss_clip": 0.01352394, + "auxiliary_loss_mlp": 0.01068742, + "balance_loss_clip": 1.0980742, + "balance_loss_mlp": 1.05041969, + "epoch": 0.02969999398785547, + "flos": 24315869450880.0, + "grad_norm": 1.909379917416462, + "language_loss": 0.80429095, + "learning_rate": 3.991254067418851e-06, + "loss": 0.8285023, + "num_input_tokens_seen": 5136825, + "step": 247, + "time_per_iteration": 2.5331099033355713 + }, + { + "auxiliary_loss_clip": 0.01342871, + "auxiliary_loss_mlp": 0.01067055, + "balance_loss_clip": 1.09528852, + "balance_loss_mlp": 1.04994798, + "epoch": 0.02982023687849456, + "flos": 35078584193280.0, + "grad_norm": 2.4163010074261524, + "language_loss": 0.83139467, + "learning_rate": 3.994181125744254e-06, + "loss": 0.8554939, + "num_input_tokens_seen": 5158630, + "step": 248, + "time_per_iteration": 2.6819405555725098 + }, + { + "auxiliary_loss_clip": 0.01346289, + "auxiliary_loss_mlp": 0.01057225, + "balance_loss_clip": 1.09550762, + "balance_loss_mlp": 1.04008269, + "epoch": 0.02994047976913365, + "flos": 26177155378560.0, + "grad_norm": 2.3555214822289794, + "language_loss": 0.74047112, + "learning_rate": 3.99709640513106e-06, + "loss": 0.76450622, + "num_input_tokens_seen": 5179510, + "step": 249, + "time_per_iteration": 2.6884233951568604 + }, + { + "auxiliary_loss_clip": 0.01347535, + "auxiliary_loss_mlp": 0.01071898, + "balance_loss_clip": 1.09291124, + "balance_loss_mlp": 1.05293226, + "epoch": 0.03006072265977274, + "flos": 25625028447360.0, + "grad_norm": 2.3579285643860426, + "language_loss": 0.85599041, + "learning_rate": 4e-06, + "loss": 0.88018471, + "num_input_tokens_seen": 5199345, + "step": 250, + "time_per_iteration": 2.6508238315582275 + }, + { + "auxiliary_loss_clip": 0.01348462, + "auxiliary_loss_mlp": 0.01056314, + "balance_loss_clip": 1.09756136, + "balance_loss_mlp": 1.03913593, + "epoch": 0.03018096555041183, + "flos": 22127078292480.0, + "grad_norm": 6.856721845759342, + "language_loss": 0.88815314, + "learning_rate": 3.999999848300794e-06, + "loss": 0.91220093, + "num_input_tokens_seen": 5218330, + "step": 251, + "time_per_iteration": 2.561594009399414 + }, + { + "auxiliary_loss_clip": 0.01340508, + "auxiliary_loss_mlp": 0.0105576, + "balance_loss_clip": 1.09116793, + "balance_loss_mlp": 1.03841567, + "epoch": 0.030301208441050925, + "flos": 30188197359360.0, + "grad_norm": 1.6813670069668534, + "language_loss": 0.89249742, + "learning_rate": 3.999999393203203e-06, + "loss": 0.91646004, + "num_input_tokens_seen": 5240740, + "step": 252, + "time_per_iteration": 2.600980520248413 + }, + { + "auxiliary_loss_clip": 0.01340194, + "auxiliary_loss_mlp": 0.01058007, + "balance_loss_clip": 1.08968043, + "balance_loss_mlp": 1.04090023, + "epoch": 0.030421451331690014, + "flos": 23621392920960.0, + "grad_norm": 2.0489429878677483, + "language_loss": 0.85192108, + "learning_rate": 3.999998634707293e-06, + "loss": 0.87590307, + "num_input_tokens_seen": 5260290, + "step": 253, + "time_per_iteration": 2.5189058780670166 + }, + { + "auxiliary_loss_clip": 0.01349752, + "auxiliary_loss_mlp": 0.01063256, + "balance_loss_clip": 1.09779787, + "balance_loss_mlp": 1.04561341, + "epoch": 0.030541694222329104, + "flos": 27928446883200.0, + "grad_norm": 3.2957676838366092, + "language_loss": 0.96555686, + "learning_rate": 3.999997572813182e-06, + "loss": 0.98968697, + "num_input_tokens_seen": 5278100, + "step": 254, + "time_per_iteration": 2.553295373916626 + }, + { + "auxiliary_loss_clip": 0.01344141, + "auxiliary_loss_mlp": 0.01069777, + "balance_loss_clip": 1.09273148, + "balance_loss_mlp": 1.05239654, + "epoch": 0.030661937112968194, + "flos": 18588441006720.0, + "grad_norm": 1.8447062214383305, + "language_loss": 0.87863976, + "learning_rate": 3.999996207521028e-06, + "loss": 0.90277898, + "num_input_tokens_seen": 5296810, + "step": 255, + "time_per_iteration": 2.519864559173584 + }, + { + "auxiliary_loss_clip": 0.01346642, + "auxiliary_loss_mlp": 0.01058714, + "balance_loss_clip": 1.09154677, + "balance_loss_mlp": 1.03992653, + "epoch": 0.030782180003607287, + "flos": 12969139478400.0, + "grad_norm": 2.1470557493906837, + "language_loss": 0.82066244, + "learning_rate": 3.999994538831039e-06, + "loss": 0.84471607, + "num_input_tokens_seen": 5313395, + "step": 256, + "time_per_iteration": 2.5047779083251953 + }, + { + "auxiliary_loss_clip": 0.013442, + "auxiliary_loss_mlp": 0.01059467, + "balance_loss_clip": 1.09288216, + "balance_loss_mlp": 1.04066777, + "epoch": 0.030902422894246377, + "flos": 23335364920320.0, + "grad_norm": 2.4750030227833117, + "language_loss": 0.85999751, + "learning_rate": 3.99999256674347e-06, + "loss": 0.88403416, + "num_input_tokens_seen": 5333545, + "step": 257, + "time_per_iteration": 2.577547788619995 + }, + { + "auxiliary_loss_clip": 0.01287211, + "auxiliary_loss_mlp": 0.01008034, + "balance_loss_clip": 1.09815097, + "balance_loss_mlp": 0.99759179, + "epoch": 0.031022665784885467, + "flos": 55094151438720.0, + "grad_norm": 1.004923587208518, + "language_loss": 0.5349049, + "learning_rate": 3.999990291258618e-06, + "loss": 0.55785733, + "num_input_tokens_seen": 5392235, + "step": 258, + "time_per_iteration": 3.0803494453430176 + }, + { + "auxiliary_loss_clip": 0.01343399, + "auxiliary_loss_mlp": 0.01061069, + "balance_loss_clip": 1.09277654, + "balance_loss_mlp": 1.0433073, + "epoch": 0.03114290867552456, + "flos": 19317786664320.0, + "grad_norm": 2.3273693251046295, + "language_loss": 0.86488926, + "learning_rate": 3.999987712376829e-06, + "loss": 0.8889339, + "num_input_tokens_seen": 5410555, + "step": 259, + "time_per_iteration": 2.5353946685791016 + }, + { + "auxiliary_loss_clip": 0.01342276, + "auxiliary_loss_mlp": 0.01060603, + "balance_loss_clip": 1.09426641, + "balance_loss_mlp": 1.04290009, + "epoch": 0.031263151566163654, + "flos": 20959442881920.0, + "grad_norm": 2.1982399831404953, + "language_loss": 0.82236338, + "learning_rate": 3.999984830098494e-06, + "loss": 0.84639215, + "num_input_tokens_seen": 5430135, + "step": 260, + "time_per_iteration": 4.185324668884277 + }, + { + "auxiliary_loss_clip": 0.01339203, + "auxiliary_loss_mlp": 0.01063006, + "balance_loss_clip": 1.09065187, + "balance_loss_mlp": 1.04500532, + "epoch": 0.03138339445680274, + "flos": 14793006412800.0, + "grad_norm": 2.9356722304355114, + "language_loss": 0.98046136, + "learning_rate": 3.999981644424051e-06, + "loss": 1.00448346, + "num_input_tokens_seen": 5444935, + "step": 261, + "time_per_iteration": 3.1868677139282227 + }, + { + "auxiliary_loss_clip": 0.01341159, + "auxiliary_loss_mlp": 0.01069933, + "balance_loss_clip": 1.0940057, + "balance_loss_mlp": 1.05084765, + "epoch": 0.03150363734744183, + "flos": 11655599022720.0, + "grad_norm": 2.645400996478755, + "language_loss": 0.86089593, + "learning_rate": 3.999978155353982e-06, + "loss": 0.88500684, + "num_input_tokens_seen": 5462080, + "step": 262, + "time_per_iteration": 2.502997636795044 + }, + { + "auxiliary_loss_clip": 0.01338949, + "auxiliary_loss_mlp": 0.01063926, + "balance_loss_clip": 1.09073329, + "balance_loss_mlp": 1.04542446, + "epoch": 0.03162388023808092, + "flos": 33727732485120.0, + "grad_norm": 2.267351285021861, + "language_loss": 0.80341858, + "learning_rate": 3.9999743628888186e-06, + "loss": 0.8274473, + "num_input_tokens_seen": 5483870, + "step": 263, + "time_per_iteration": 2.6104564666748047 + }, + { + "auxiliary_loss_clip": 0.01332334, + "auxiliary_loss_mlp": 0.01060338, + "balance_loss_clip": 1.08722758, + "balance_loss_mlp": 1.0426352, + "epoch": 0.03174412312872001, + "flos": 20810952057600.0, + "grad_norm": 2.7662914793297864, + "language_loss": 0.89219964, + "learning_rate": 3.999970267029133e-06, + "loss": 0.91612631, + "num_input_tokens_seen": 5502830, + "step": 264, + "time_per_iteration": 2.5291385650634766 + }, + { + "auxiliary_loss_clip": 0.01335072, + "auxiliary_loss_mlp": 0.01055993, + "balance_loss_clip": 1.0904026, + "balance_loss_mlp": 1.03866029, + "epoch": 0.0318643660193591, + "flos": 23727939638400.0, + "grad_norm": 2.4709643370415564, + "language_loss": 0.80019975, + "learning_rate": 3.999965867775548e-06, + "loss": 0.82411039, + "num_input_tokens_seen": 5523225, + "step": 265, + "time_per_iteration": 2.5426151752471924 + }, + { + "auxiliary_loss_clip": 0.01337859, + "auxiliary_loss_mlp": 0.01066422, + "balance_loss_clip": 1.09038353, + "balance_loss_mlp": 1.04913712, + "epoch": 0.0319846089099982, + "flos": 13917863450880.0, + "grad_norm": 2.5103618653069684, + "language_loss": 0.86971104, + "learning_rate": 3.9999611651287315e-06, + "loss": 0.89375389, + "num_input_tokens_seen": 5541380, + "step": 266, + "time_per_iteration": 2.5159478187561035 + }, + { + "auxiliary_loss_clip": 0.01340565, + "auxiliary_loss_mlp": 0.01060924, + "balance_loss_clip": 1.09252357, + "balance_loss_mlp": 1.04366207, + "epoch": 0.03210485180063729, + "flos": 14753253035520.0, + "grad_norm": 2.393723200460593, + "language_loss": 0.78739327, + "learning_rate": 3.999956159089396e-06, + "loss": 0.81140816, + "num_input_tokens_seen": 5558830, + "step": 267, + "time_per_iteration": 2.4616403579711914 + }, + { + "auxiliary_loss_clip": 0.0133729, + "auxiliary_loss_mlp": 0.01065539, + "balance_loss_clip": 1.09130144, + "balance_loss_mlp": 1.04777694, + "epoch": 0.03222509469127638, + "flos": 28913153304960.0, + "grad_norm": 2.3866526384486466, + "language_loss": 0.79274267, + "learning_rate": 3.999950849658302e-06, + "loss": 0.81677091, + "num_input_tokens_seen": 5577750, + "step": 268, + "time_per_iteration": 2.5542900562286377 + }, + { + "auxiliary_loss_clip": 0.0134391, + "auxiliary_loss_mlp": 0.01067759, + "balance_loss_clip": 1.09357536, + "balance_loss_mlp": 1.05040216, + "epoch": 0.03234533758191547, + "flos": 16946389739520.0, + "grad_norm": 2.16188505563306, + "language_loss": 0.841959, + "learning_rate": 3.999945236836254e-06, + "loss": 0.86607569, + "num_input_tokens_seen": 5596715, + "step": 269, + "time_per_iteration": 2.50282883644104 + }, + { + "auxiliary_loss_clip": 0.0134402, + "auxiliary_loss_mlp": 0.01066178, + "balance_loss_clip": 1.09516478, + "balance_loss_mlp": 1.04737902, + "epoch": 0.03246558047255456, + "flos": 18989096284800.0, + "grad_norm": 2.4772982015118155, + "language_loss": 0.94841027, + "learning_rate": 3.999939320624103e-06, + "loss": 0.97251225, + "num_input_tokens_seen": 5611865, + "step": 270, + "time_per_iteration": 2.509799003601074 + }, + { + "auxiliary_loss_clip": 0.01341445, + "auxiliary_loss_mlp": 0.0106344, + "balance_loss_clip": 1.09389699, + "balance_loss_mlp": 1.04583311, + "epoch": 0.03258582336319365, + "flos": 23728334688000.0, + "grad_norm": 1.9824435272516618, + "language_loss": 0.90065706, + "learning_rate": 3.999933101022749e-06, + "loss": 0.92470586, + "num_input_tokens_seen": 5632270, + "step": 271, + "time_per_iteration": 2.596348285675049 + }, + { + "auxiliary_loss_clip": 0.01337975, + "auxiliary_loss_mlp": 0.01065526, + "balance_loss_clip": 1.09219527, + "balance_loss_mlp": 1.04808581, + "epoch": 0.032706066253832745, + "flos": 27670823562240.0, + "grad_norm": 2.134231135976669, + "language_loss": 0.86807013, + "learning_rate": 3.999926578033132e-06, + "loss": 0.8921051, + "num_input_tokens_seen": 5652085, + "step": 272, + "time_per_iteration": 2.589371919631958 + }, + { + "auxiliary_loss_clip": 0.01337107, + "auxiliary_loss_mlp": 0.01063806, + "balance_loss_clip": 1.08852267, + "balance_loss_mlp": 1.04598451, + "epoch": 0.032826309144471835, + "flos": 45624685968000.0, + "grad_norm": 1.9736985798727305, + "language_loss": 0.63150817, + "learning_rate": 3.999919751656244e-06, + "loss": 0.65551728, + "num_input_tokens_seen": 5678985, + "step": 273, + "time_per_iteration": 2.71744441986084 + }, + { + "auxiliary_loss_clip": 0.01333368, + "auxiliary_loss_mlp": 0.01057577, + "balance_loss_clip": 1.08777916, + "balance_loss_mlp": 1.03900385, + "epoch": 0.032946552035110925, + "flos": 25812374808960.0, + "grad_norm": 2.8679144539871495, + "language_loss": 0.75774014, + "learning_rate": 3.9999126218931195e-06, + "loss": 0.78164953, + "num_input_tokens_seen": 5697020, + "step": 274, + "time_per_iteration": 2.5234222412109375 + }, + { + "auxiliary_loss_clip": 0.01340223, + "auxiliary_loss_mlp": 0.01052168, + "balance_loss_clip": 1.09333205, + "balance_loss_mlp": 1.03462029, + "epoch": 0.033066794925750015, + "flos": 15121984101120.0, + "grad_norm": 2.1604332970218394, + "language_loss": 0.89828002, + "learning_rate": 3.99990518874484e-06, + "loss": 0.92220396, + "num_input_tokens_seen": 5713460, + "step": 275, + "time_per_iteration": 2.514495849609375 + }, + { + "auxiliary_loss_clip": 0.01337893, + "auxiliary_loss_mlp": 0.01068565, + "balance_loss_clip": 1.09233379, + "balance_loss_mlp": 1.05123186, + "epoch": 0.033187037816389105, + "flos": 22776593973120.0, + "grad_norm": 2.295405872491499, + "language_loss": 0.92337883, + "learning_rate": 3.999897452212534e-06, + "loss": 0.94744337, + "num_input_tokens_seen": 5730790, + "step": 276, + "time_per_iteration": 2.5258681774139404 + }, + { + "auxiliary_loss_clip": 0.01333459, + "auxiliary_loss_mlp": 0.01064446, + "balance_loss_clip": 1.0898875, + "balance_loss_mlp": 1.0465169, + "epoch": 0.033307280707028195, + "flos": 23331414424320.0, + "grad_norm": 2.170279915796394, + "language_loss": 1.00139761, + "learning_rate": 3.999889412297374e-06, + "loss": 1.02537668, + "num_input_tokens_seen": 5750215, + "step": 277, + "time_per_iteration": 2.5562474727630615 + }, + { + "auxiliary_loss_clip": 0.01333571, + "auxiliary_loss_mlp": 0.01043413, + "balance_loss_clip": 1.08807111, + "balance_loss_mlp": 1.02710533, + "epoch": 0.03342752359766729, + "flos": 28840290566400.0, + "grad_norm": 2.066377876317811, + "language_loss": 0.79029882, + "learning_rate": 3.999881069000581e-06, + "loss": 0.81406868, + "num_input_tokens_seen": 5769945, + "step": 278, + "time_per_iteration": 2.58473539352417 + }, + { + "auxiliary_loss_clip": 0.01335182, + "auxiliary_loss_mlp": 0.01054719, + "balance_loss_clip": 1.08848381, + "balance_loss_mlp": 1.03621721, + "epoch": 0.03354776648830638, + "flos": 19384544090880.0, + "grad_norm": 3.367739707248116, + "language_loss": 0.87200749, + "learning_rate": 3.99987242232342e-06, + "loss": 0.89590657, + "num_input_tokens_seen": 5784950, + "step": 279, + "time_per_iteration": 2.507972240447998 + }, + { + "auxiliary_loss_clip": 0.0133659, + "auxiliary_loss_mlp": 0.010664, + "balance_loss_clip": 1.09167504, + "balance_loss_mlp": 1.04811358, + "epoch": 0.03366800937894547, + "flos": 17858628472320.0, + "grad_norm": 1.9479442345584634, + "language_loss": 0.79613489, + "learning_rate": 3.9998634722672026e-06, + "loss": 0.8201648, + "num_input_tokens_seen": 5805005, + "step": 280, + "time_per_iteration": 2.526787757873535 + }, + { + "auxiliary_loss_clip": 0.01336722, + "auxiliary_loss_mlp": 0.010581, + "balance_loss_clip": 1.09226966, + "balance_loss_mlp": 1.04107666, + "epoch": 0.03378825226958456, + "flos": 35951033635200.0, + "grad_norm": 2.120507500201445, + "language_loss": 0.78565633, + "learning_rate": 3.999854218833286e-06, + "loss": 0.80960453, + "num_input_tokens_seen": 5825825, + "step": 281, + "time_per_iteration": 2.6457369327545166 + }, + { + "auxiliary_loss_clip": 0.01335434, + "auxiliary_loss_mlp": 0.01062124, + "balance_loss_clip": 1.09196496, + "balance_loss_mlp": 1.04435015, + "epoch": 0.03390849516022365, + "flos": 25702488126720.0, + "grad_norm": 2.24343246384953, + "language_loss": 0.81778347, + "learning_rate": 3.999844662023075e-06, + "loss": 0.84175909, + "num_input_tokens_seen": 5845700, + "step": 282, + "time_per_iteration": 2.545872688293457 + }, + { + "auxiliary_loss_clip": 0.0132775, + "auxiliary_loss_mlp": 0.01058647, + "balance_loss_clip": 1.08758342, + "balance_loss_mlp": 1.04140937, + "epoch": 0.03402873805086274, + "flos": 21284505987840.0, + "grad_norm": 1.8115762606009012, + "language_loss": 0.92019212, + "learning_rate": 3.999834801838018e-06, + "loss": 0.94405603, + "num_input_tokens_seen": 5864680, + "step": 283, + "time_per_iteration": 2.5256025791168213 + }, + { + "auxiliary_loss_clip": 0.01329167, + "auxiliary_loss_mlp": 0.01056436, + "balance_loss_clip": 1.08850718, + "balance_loss_mlp": 1.03924584, + "epoch": 0.03414898094150183, + "flos": 22710913954560.0, + "grad_norm": 1.8912749750533082, + "language_loss": 0.74029875, + "learning_rate": 3.9998246382796115e-06, + "loss": 0.76415479, + "num_input_tokens_seen": 5884260, + "step": 284, + "time_per_iteration": 2.5002176761627197 + }, + { + "auxiliary_loss_clip": 0.01333349, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_clip": 1.08690763, + "balance_loss_mlp": 1.03386402, + "epoch": 0.03426922383214093, + "flos": 18879927874560.0, + "grad_norm": 2.256044220588054, + "language_loss": 0.90905637, + "learning_rate": 3.999814171349399e-06, + "loss": 0.93291152, + "num_input_tokens_seen": 5902120, + "step": 285, + "time_per_iteration": 2.501950979232788 + }, + { + "auxiliary_loss_clip": 0.0132887, + "auxiliary_loss_mlp": 0.01055901, + "balance_loss_clip": 1.08813155, + "balance_loss_mlp": 1.03940225, + "epoch": 0.03438946672278002, + "flos": 34752012716160.0, + "grad_norm": 1.6021543378440304, + "language_loss": 0.73486495, + "learning_rate": 3.9998034010489655e-06, + "loss": 0.75871265, + "num_input_tokens_seen": 5925810, + "step": 286, + "time_per_iteration": 2.619114875793457 + }, + { + "auxiliary_loss_clip": 0.01328761, + "auxiliary_loss_mlp": 0.01061034, + "balance_loss_clip": 1.08955002, + "balance_loss_mlp": 1.04446411, + "epoch": 0.03450970961341911, + "flos": 22164102236160.0, + "grad_norm": 2.637695448636838, + "language_loss": 0.76006019, + "learning_rate": 3.999792327379946e-06, + "loss": 0.78395808, + "num_input_tokens_seen": 5945185, + "step": 287, + "time_per_iteration": 3.3553969860076904 + }, + { + "auxiliary_loss_clip": 0.01334865, + "auxiliary_loss_mlp": 0.01063927, + "balance_loss_clip": 1.09394956, + "balance_loss_mlp": 1.04707134, + "epoch": 0.034629952504058197, + "flos": 21725740656000.0, + "grad_norm": 2.1228582628350092, + "language_loss": 0.96314919, + "learning_rate": 3.999780950344021e-06, + "loss": 0.98713708, + "num_input_tokens_seen": 5963375, + "step": 288, + "time_per_iteration": 3.356580972671509 + }, + { + "auxiliary_loss_clip": 0.01336544, + "auxiliary_loss_mlp": 0.01065429, + "balance_loss_clip": 1.09137404, + "balance_loss_mlp": 1.04733264, + "epoch": 0.034750195394697286, + "flos": 20047994248320.0, + "grad_norm": 1.923355404044138, + "language_loss": 0.82856649, + "learning_rate": 3.999769269942916e-06, + "loss": 0.85258615, + "num_input_tokens_seen": 5983415, + "step": 289, + "time_per_iteration": 3.273599863052368 + }, + { + "auxiliary_loss_clip": 0.01329649, + "auxiliary_loss_mlp": 0.01055401, + "balance_loss_clip": 1.08848834, + "balance_loss_mlp": 1.03786576, + "epoch": 0.034870438285336376, + "flos": 27965865876480.0, + "grad_norm": 1.7660963300444175, + "language_loss": 0.8114922, + "learning_rate": 3.999757286178402e-06, + "loss": 0.83534265, + "num_input_tokens_seen": 6005850, + "step": 290, + "time_per_iteration": 2.571743965148926 + }, + { + "auxiliary_loss_clip": 0.01332818, + "auxiliary_loss_mlp": 0.01050877, + "balance_loss_clip": 1.09096825, + "balance_loss_mlp": 1.03334117, + "epoch": 0.03499068117597547, + "flos": 22017514832640.0, + "grad_norm": 2.817466840867797, + "language_loss": 0.90789264, + "learning_rate": 3.999744999052299e-06, + "loss": 0.93172961, + "num_input_tokens_seen": 6027240, + "step": 291, + "time_per_iteration": 2.5321621894836426 + }, + { + "auxiliary_loss_clip": 0.0127556, + "auxiliary_loss_mlp": 0.01032921, + "balance_loss_clip": 1.09456384, + "balance_loss_mlp": 1.02276409, + "epoch": 0.03511092406661456, + "flos": 57242147725440.0, + "grad_norm": 0.9579212115780174, + "language_loss": 0.61122328, + "learning_rate": 3.9997324085664675e-06, + "loss": 0.6343081, + "num_input_tokens_seen": 6087470, + "step": 292, + "time_per_iteration": 3.0541954040527344 + }, + { + "auxiliary_loss_clip": 0.0132714, + "auxiliary_loss_mlp": 0.01057525, + "balance_loss_clip": 1.08639526, + "balance_loss_mlp": 1.03988171, + "epoch": 0.03523116695725365, + "flos": 22928065626240.0, + "grad_norm": 2.8206649418372063, + "language_loss": 0.92031407, + "learning_rate": 3.999719514722821e-06, + "loss": 0.94416064, + "num_input_tokens_seen": 6107600, + "step": 293, + "time_per_iteration": 2.5209949016571045 + }, + { + "auxiliary_loss_clip": 0.01324131, + "auxiliary_loss_mlp": 0.0105454, + "balance_loss_clip": 1.08604956, + "balance_loss_mlp": 1.0381012, + "epoch": 0.03535140984789274, + "flos": 36903241226880.0, + "grad_norm": 2.9812749147255326, + "language_loss": 0.74878967, + "learning_rate": 3.999706317523314e-06, + "loss": 0.77257627, + "num_input_tokens_seen": 6126160, + "step": 294, + "time_per_iteration": 2.658825635910034 + }, + { + "auxiliary_loss_clip": 0.01325559, + "auxiliary_loss_mlp": 0.01051473, + "balance_loss_clip": 1.08720243, + "balance_loss_mlp": 1.03529596, + "epoch": 0.03547165273853183, + "flos": 20449152316800.0, + "grad_norm": 2.0086204547311945, + "language_loss": 0.85961372, + "learning_rate": 3.999692816969948e-06, + "loss": 0.88338405, + "num_input_tokens_seen": 6145695, + "step": 295, + "time_per_iteration": 2.555886745452881 + }, + { + "auxiliary_loss_clip": 0.01264517, + "auxiliary_loss_mlp": 0.01014689, + "balance_loss_clip": 1.08577347, + "balance_loss_mlp": 1.00429428, + "epoch": 0.03559189562917092, + "flos": 69850564871040.0, + "grad_norm": 0.9962205863226998, + "language_loss": 0.69369656, + "learning_rate": 3.999679013064772e-06, + "loss": 0.71648866, + "num_input_tokens_seen": 6212440, + "step": 296, + "time_per_iteration": 3.1484200954437256 + }, + { + "auxiliary_loss_clip": 0.01328386, + "auxiliary_loss_mlp": 0.01055972, + "balance_loss_clip": 1.08866513, + "balance_loss_mlp": 1.03918695, + "epoch": 0.03571213851981002, + "flos": 21651944163840.0, + "grad_norm": 2.7196133241091807, + "language_loss": 0.85931814, + "learning_rate": 3.99966490580988e-06, + "loss": 0.88316172, + "num_input_tokens_seen": 6229800, + "step": 297, + "time_per_iteration": 2.5340359210968018 + }, + { + "auxiliary_loss_clip": 0.01329534, + "auxiliary_loss_mlp": 0.01061466, + "balance_loss_clip": 1.0877459, + "balance_loss_mlp": 1.04449058, + "epoch": 0.03583238141044911, + "flos": 43945610757120.0, + "grad_norm": 2.2396501534292472, + "language_loss": 0.65846479, + "learning_rate": 3.999650495207411e-06, + "loss": 0.68237484, + "num_input_tokens_seen": 6255825, + "step": 298, + "time_per_iteration": 2.708862543106079 + }, + { + "auxiliary_loss_clip": 0.01322169, + "auxiliary_loss_mlp": 0.01058964, + "balance_loss_clip": 1.08597827, + "balance_loss_mlp": 1.0416193, + "epoch": 0.0359526243010882, + "flos": 18910810592640.0, + "grad_norm": 3.2348747223661514, + "language_loss": 0.90328115, + "learning_rate": 3.999635781259553e-06, + "loss": 0.92709249, + "num_input_tokens_seen": 6271090, + "step": 299, + "time_per_iteration": 2.509911298751831 + }, + { + "auxiliary_loss_clip": 0.0124728, + "auxiliary_loss_mlp": 0.01009398, + "balance_loss_clip": 1.07158875, + "balance_loss_mlp": 0.99881184, + "epoch": 0.03607286719172729, + "flos": 61668892782720.0, + "grad_norm": 0.9174435658753497, + "language_loss": 0.5222618, + "learning_rate": 3.999620763968535e-06, + "loss": 0.54482859, + "num_input_tokens_seen": 6329965, + "step": 300, + "time_per_iteration": 2.985281467437744 + }, + { + "auxiliary_loss_clip": 0.01322019, + "auxiliary_loss_mlp": 0.01054684, + "balance_loss_clip": 1.08697486, + "balance_loss_mlp": 1.0374819, + "epoch": 0.03619311008236638, + "flos": 27819062991360.0, + "grad_norm": 1.6924198064025273, + "language_loss": 0.86637765, + "learning_rate": 3.999605443336638e-06, + "loss": 0.89014471, + "num_input_tokens_seen": 6352095, + "step": 301, + "time_per_iteration": 2.6531150341033936 + }, + { + "auxiliary_loss_clip": 0.01328685, + "auxiliary_loss_mlp": 0.01060367, + "balance_loss_clip": 1.08937466, + "balance_loss_mlp": 1.04327273, + "epoch": 0.03631335297300547, + "flos": 13621133197440.0, + "grad_norm": 2.5111007791976414, + "language_loss": 0.8970139, + "learning_rate": 3.999589819366185e-06, + "loss": 0.9209044, + "num_input_tokens_seen": 6365885, + "step": 302, + "time_per_iteration": 2.500898599624634 + }, + { + "auxiliary_loss_clip": 0.01328616, + "auxiliary_loss_mlp": 0.01056838, + "balance_loss_clip": 1.08870649, + "balance_loss_mlp": 1.03904057, + "epoch": 0.036433595863644565, + "flos": 27631788456960.0, + "grad_norm": 2.09520069352677, + "language_loss": 0.84832335, + "learning_rate": 3.999573892059547e-06, + "loss": 0.87217796, + "num_input_tokens_seen": 6385015, + "step": 303, + "time_per_iteration": 2.5832653045654297 + }, + { + "auxiliary_loss_clip": 0.01331332, + "auxiliary_loss_mlp": 0.0106247, + "balance_loss_clip": 1.08946693, + "balance_loss_mlp": 1.04401696, + "epoch": 0.036553838754283655, + "flos": 24572020314240.0, + "grad_norm": 2.161686598713949, + "language_loss": 0.81187534, + "learning_rate": 3.999557661419138e-06, + "loss": 0.8358134, + "num_input_tokens_seen": 6405165, + "step": 304, + "time_per_iteration": 2.521466016769409 + }, + { + "auxiliary_loss_clip": 0.01329276, + "auxiliary_loss_mlp": 0.01057483, + "balance_loss_clip": 1.09052372, + "balance_loss_mlp": 1.04122305, + "epoch": 0.036674081644922744, + "flos": 23404313076480.0, + "grad_norm": 1.9459934439852062, + "language_loss": 0.81442571, + "learning_rate": 3.9995411274474225e-06, + "loss": 0.83829331, + "num_input_tokens_seen": 6424445, + "step": 305, + "time_per_iteration": 2.5334625244140625 + }, + { + "auxiliary_loss_clip": 0.01327331, + "auxiliary_loss_mlp": 0.01065835, + "balance_loss_clip": 1.08737397, + "balance_loss_mlp": 1.047858, + "epoch": 0.036794324535561834, + "flos": 27489690253440.0, + "grad_norm": 2.0954074933042506, + "language_loss": 0.81599385, + "learning_rate": 3.999524290146908e-06, + "loss": 0.83992547, + "num_input_tokens_seen": 6444650, + "step": 306, + "time_per_iteration": 2.6260616779327393 + }, + { + "auxiliary_loss_clip": 0.01325214, + "auxiliary_loss_mlp": 0.01066193, + "balance_loss_clip": 1.08915496, + "balance_loss_mlp": 1.04903829, + "epoch": 0.036914567426200924, + "flos": 19463476227840.0, + "grad_norm": 2.635763558910293, + "language_loss": 0.92800236, + "learning_rate": 3.9995071495201485e-06, + "loss": 0.95191634, + "num_input_tokens_seen": 6461755, + "step": 307, + "time_per_iteration": 2.5212206840515137 + }, + { + "auxiliary_loss_clip": 0.01325563, + "auxiliary_loss_mlp": 0.01061206, + "balance_loss_clip": 1.08881021, + "balance_loss_mlp": 1.04326475, + "epoch": 0.037034810316840014, + "flos": 22309324922880.0, + "grad_norm": 2.9360639537587425, + "language_loss": 0.97812188, + "learning_rate": 3.999489705569744e-06, + "loss": 1.0019896, + "num_input_tokens_seen": 6479455, + "step": 308, + "time_per_iteration": 2.5483994483947754 + }, + { + "auxiliary_loss_clip": 0.0132289, + "auxiliary_loss_mlp": 0.01058299, + "balance_loss_clip": 1.08516765, + "balance_loss_mlp": 1.04157436, + "epoch": 0.03715505320747911, + "flos": 18588333265920.0, + "grad_norm": 2.6182183455842285, + "language_loss": 0.86391532, + "learning_rate": 3.999471958298341e-06, + "loss": 0.88772726, + "num_input_tokens_seen": 6498365, + "step": 309, + "time_per_iteration": 2.507850408554077 + }, + { + "auxiliary_loss_clip": 0.01329665, + "auxiliary_loss_mlp": 0.01067838, + "balance_loss_clip": 1.09073079, + "balance_loss_mlp": 1.04976583, + "epoch": 0.0372752960981182, + "flos": 35955343267200.0, + "grad_norm": 2.0334745302381405, + "language_loss": 0.76096392, + "learning_rate": 3.999453907708631e-06, + "loss": 0.78493893, + "num_input_tokens_seen": 6520770, + "step": 310, + "time_per_iteration": 2.6196281909942627 + }, + { + "auxiliary_loss_clip": 0.01326735, + "auxiliary_loss_mlp": 0.01053964, + "balance_loss_clip": 1.08909392, + "balance_loss_mlp": 1.03751278, + "epoch": 0.03739553898875729, + "flos": 20814040627200.0, + "grad_norm": 1.7142251968670252, + "language_loss": 0.81299865, + "learning_rate": 3.999435553803353e-06, + "loss": 0.83680564, + "num_input_tokens_seen": 6540170, + "step": 311, + "time_per_iteration": 2.539167642593384 + }, + { + "auxiliary_loss_clip": 0.01323714, + "auxiliary_loss_mlp": 0.01061315, + "balance_loss_clip": 1.0877887, + "balance_loss_mlp": 1.04450667, + "epoch": 0.03751578187939638, + "flos": 20264140339200.0, + "grad_norm": 2.3178425497429402, + "language_loss": 0.8306613, + "learning_rate": 3.999416896585292e-06, + "loss": 0.85451156, + "num_input_tokens_seen": 6557200, + "step": 312, + "time_per_iteration": 2.4963958263397217 + }, + { + "auxiliary_loss_clip": 0.01325557, + "auxiliary_loss_mlp": 0.01055698, + "balance_loss_clip": 1.08774781, + "balance_loss_mlp": 1.0384481, + "epoch": 0.03763602477003547, + "flos": 20668063754880.0, + "grad_norm": 7.972621094044825, + "language_loss": 0.85312146, + "learning_rate": 3.9993979360572775e-06, + "loss": 0.87693405, + "num_input_tokens_seen": 6577340, + "step": 313, + "time_per_iteration": 3.361898422241211 + }, + { + "auxiliary_loss_clip": 0.01332534, + "auxiliary_loss_mlp": 0.0106335, + "balance_loss_clip": 1.09138894, + "balance_loss_mlp": 1.04605293, + "epoch": 0.03775626766067456, + "flos": 16691352197760.0, + "grad_norm": 2.9572263982059153, + "language_loss": 0.8326993, + "learning_rate": 3.999378672222185e-06, + "loss": 0.8566581, + "num_input_tokens_seen": 6595125, + "step": 314, + "time_per_iteration": 2.488062620162964 + }, + { + "auxiliary_loss_clip": 0.01327943, + "auxiliary_loss_mlp": 0.01056861, + "balance_loss_clip": 1.09060633, + "balance_loss_mlp": 1.03814507, + "epoch": 0.03787651055131366, + "flos": 21141797253120.0, + "grad_norm": 2.348276936918884, + "language_loss": 0.82739699, + "learning_rate": 3.9993591050829385e-06, + "loss": 0.85124505, + "num_input_tokens_seen": 6612990, + "step": 315, + "time_per_iteration": 4.209430932998657 + }, + { + "auxiliary_loss_clip": 0.01327291, + "auxiliary_loss_mlp": 0.01065795, + "balance_loss_clip": 1.09015453, + "balance_loss_mlp": 1.04814053, + "epoch": 0.037996753441952746, + "flos": 22018089450240.0, + "grad_norm": 2.3789878680614844, + "language_loss": 0.79256058, + "learning_rate": 3.999339234642506e-06, + "loss": 0.81649148, + "num_input_tokens_seen": 6632740, + "step": 316, + "time_per_iteration": 2.517179489135742 + }, + { + "auxiliary_loss_clip": 0.01328657, + "auxiliary_loss_mlp": 0.01050433, + "balance_loss_clip": 1.09153104, + "balance_loss_mlp": 1.03221774, + "epoch": 0.038116996332591836, + "flos": 27709391790720.0, + "grad_norm": 3.3013038311176386, + "language_loss": 0.83545929, + "learning_rate": 3.9993190609038994e-06, + "loss": 0.85925019, + "num_input_tokens_seen": 6651505, + "step": 317, + "time_per_iteration": 3.337282419204712 + }, + { + "auxiliary_loss_clip": 0.01319379, + "auxiliary_loss_mlp": 0.01049781, + "balance_loss_clip": 1.08588779, + "balance_loss_mlp": 1.03243613, + "epoch": 0.038237239223230926, + "flos": 21178067011200.0, + "grad_norm": 2.0830086482639034, + "language_loss": 0.833009, + "learning_rate": 3.999298583870182e-06, + "loss": 0.85670054, + "num_input_tokens_seen": 6671090, + "step": 318, + "time_per_iteration": 2.5150322914123535 + }, + { + "auxiliary_loss_clip": 0.01322628, + "auxiliary_loss_mlp": 0.01057801, + "balance_loss_clip": 1.08708167, + "balance_loss_mlp": 1.04056382, + "epoch": 0.038357482113870016, + "flos": 25556618995200.0, + "grad_norm": 2.7204237437115717, + "language_loss": 0.77655065, + "learning_rate": 3.999277803544458e-06, + "loss": 0.80035496, + "num_input_tokens_seen": 6691245, + "step": 319, + "time_per_iteration": 2.5506980419158936 + }, + { + "auxiliary_loss_clip": 0.01230239, + "auxiliary_loss_mlp": 0.01019467, + "balance_loss_clip": 1.0673337, + "balance_loss_mlp": 1.01012063, + "epoch": 0.038477725004509106, + "flos": 59227578034560.0, + "grad_norm": 0.9630927809641395, + "language_loss": 0.6238597, + "learning_rate": 3.999256719929882e-06, + "loss": 0.64635676, + "num_input_tokens_seen": 6752520, + "step": 320, + "time_per_iteration": 3.092270612716675 + }, + { + "auxiliary_loss_clip": 0.01228995, + "auxiliary_loss_mlp": 0.01011204, + "balance_loss_clip": 1.06638336, + "balance_loss_mlp": 1.00195336, + "epoch": 0.0385979678951482, + "flos": 67317676398720.0, + "grad_norm": 1.2175475917935272, + "language_loss": 0.67108715, + "learning_rate": 3.999235333029651e-06, + "loss": 0.69348913, + "num_input_tokens_seen": 6806460, + "step": 321, + "time_per_iteration": 2.9851176738739014 + }, + { + "auxiliary_loss_clip": 0.01322092, + "auxiliary_loss_mlp": 0.01058274, + "balance_loss_clip": 1.08937657, + "balance_loss_mlp": 1.04152548, + "epoch": 0.03871821078578729, + "flos": 22746752749440.0, + "grad_norm": 1.8569989273025194, + "language_loss": 0.81926775, + "learning_rate": 3.999213642847009e-06, + "loss": 0.84307146, + "num_input_tokens_seen": 6827045, + "step": 322, + "time_per_iteration": 2.5512428283691406 + }, + { + "auxiliary_loss_clip": 0.0132256, + "auxiliary_loss_mlp": 0.01057646, + "balance_loss_clip": 1.08720958, + "balance_loss_mlp": 1.04084945, + "epoch": 0.03883845367642638, + "flos": 26280613526400.0, + "grad_norm": 1.7577695843165855, + "language_loss": 0.91229081, + "learning_rate": 3.999191649385247e-06, + "loss": 0.93609285, + "num_input_tokens_seen": 6848220, + "step": 323, + "time_per_iteration": 2.6603574752807617 + }, + { + "auxiliary_loss_clip": 0.0122134, + "auxiliary_loss_mlp": 0.01009497, + "balance_loss_clip": 1.06197786, + "balance_loss_mlp": 1.00077116, + "epoch": 0.03895869656706547, + "flos": 56962835568000.0, + "grad_norm": 0.906988018940969, + "language_loss": 0.59800816, + "learning_rate": 3.999169352647702e-06, + "loss": 0.62031662, + "num_input_tokens_seen": 6909400, + "step": 324, + "time_per_iteration": 2.9916763305664062 + }, + { + "auxiliary_loss_clip": 0.01323623, + "auxiliary_loss_mlp": 0.01081837, + "balance_loss_clip": 1.08758521, + "balance_loss_mlp": 1.06322801, + "epoch": 0.03907893945770456, + "flos": 24863363527680.0, + "grad_norm": 1.8507078007250286, + "language_loss": 0.83109725, + "learning_rate": 3.999146752637755e-06, + "loss": 0.85515183, + "num_input_tokens_seen": 6930445, + "step": 325, + "time_per_iteration": 2.549497604370117 + }, + { + "auxiliary_loss_clip": 0.01322288, + "auxiliary_loss_mlp": 0.01060797, + "balance_loss_clip": 1.08705592, + "balance_loss_mlp": 1.04311848, + "epoch": 0.03919918234834365, + "flos": 18368595815040.0, + "grad_norm": 2.8756009460010863, + "language_loss": 0.89516294, + "learning_rate": 3.999123849358836e-06, + "loss": 0.91899383, + "num_input_tokens_seen": 6948110, + "step": 326, + "time_per_iteration": 2.487290382385254 + }, + { + "auxiliary_loss_clip": 0.01322495, + "auxiliary_loss_mlp": 0.01065955, + "balance_loss_clip": 1.08741999, + "balance_loss_mlp": 1.04758453, + "epoch": 0.03931942523898275, + "flos": 25225414663680.0, + "grad_norm": 2.2656996093550297, + "language_loss": 0.74592853, + "learning_rate": 3.999100642814418e-06, + "loss": 0.76981294, + "num_input_tokens_seen": 6968550, + "step": 327, + "time_per_iteration": 2.5387938022613525 + }, + { + "auxiliary_loss_clip": 0.01321406, + "auxiliary_loss_mlp": 0.01061238, + "balance_loss_clip": 1.08763158, + "balance_loss_mlp": 1.04345202, + "epoch": 0.03943966812962184, + "flos": 23257905240960.0, + "grad_norm": 2.3694148212369535, + "language_loss": 0.88497484, + "learning_rate": 3.999077133008022e-06, + "loss": 0.90880126, + "num_input_tokens_seen": 6987135, + "step": 328, + "time_per_iteration": 2.5143589973449707 + }, + { + "auxiliary_loss_clip": 0.0132301, + "auxiliary_loss_mlp": 0.01064153, + "balance_loss_clip": 1.08795619, + "balance_loss_mlp": 1.04462671, + "epoch": 0.03955991102026093, + "flos": 29168837291520.0, + "grad_norm": 2.2706797785465076, + "language_loss": 0.90551168, + "learning_rate": 3.9990533199432145e-06, + "loss": 0.92938328, + "num_input_tokens_seen": 7008630, + "step": 329, + "time_per_iteration": 2.554593563079834 + }, + { + "auxiliary_loss_clip": 0.01320879, + "auxiliary_loss_mlp": 0.01057919, + "balance_loss_clip": 1.08672369, + "balance_loss_mlp": 1.04010868, + "epoch": 0.03968015391090002, + "flos": 17602441695360.0, + "grad_norm": 2.5874890831835398, + "language_loss": 0.7574122, + "learning_rate": 3.999029203623608e-06, + "loss": 0.78120023, + "num_input_tokens_seen": 7026350, + "step": 330, + "time_per_iteration": 2.4613447189331055 + }, + { + "auxiliary_loss_clip": 0.01317444, + "auxiliary_loss_mlp": 0.01055333, + "balance_loss_clip": 1.08607543, + "balance_loss_mlp": 1.03749931, + "epoch": 0.03980039680153911, + "flos": 21799285752960.0, + "grad_norm": 2.173477655795099, + "language_loss": 0.86607158, + "learning_rate": 3.99900478405286e-06, + "loss": 0.88979924, + "num_input_tokens_seen": 7045660, + "step": 331, + "time_per_iteration": 2.490781545639038 + }, + { + "auxiliary_loss_clip": 0.01318404, + "auxiliary_loss_mlp": 0.01061005, + "balance_loss_clip": 1.08941865, + "balance_loss_mlp": 1.04522133, + "epoch": 0.0399206396921782, + "flos": 15195134148480.0, + "grad_norm": 2.5433866667122227, + "language_loss": 0.82356119, + "learning_rate": 3.998980061234676e-06, + "loss": 0.84735525, + "num_input_tokens_seen": 7063575, + "step": 332, + "time_per_iteration": 2.4944329261779785 + }, + { + "auxiliary_loss_clip": 0.01325771, + "auxiliary_loss_mlp": 0.01053483, + "balance_loss_clip": 1.08843422, + "balance_loss_mlp": 1.03524363, + "epoch": 0.040040882582817294, + "flos": 14422910630400.0, + "grad_norm": 2.6891676369336652, + "language_loss": 0.75439119, + "learning_rate": 3.9989550351728055e-06, + "loss": 0.77818376, + "num_input_tokens_seen": 7080505, + "step": 333, + "time_per_iteration": 2.4763119220733643 + }, + { + "auxiliary_loss_clip": 0.01319959, + "auxiliary_loss_mlp": 0.01056229, + "balance_loss_clip": 1.08849204, + "balance_loss_mlp": 1.0393132, + "epoch": 0.040161125473456384, + "flos": 19280906375040.0, + "grad_norm": 2.4471815199456306, + "language_loss": 0.84606612, + "learning_rate": 3.998929705871046e-06, + "loss": 0.86982799, + "num_input_tokens_seen": 7097860, + "step": 334, + "time_per_iteration": 2.4974420070648193 + }, + { + "auxiliary_loss_clip": 0.01318843, + "auxiliary_loss_mlp": 0.01057949, + "balance_loss_clip": 1.08930421, + "balance_loss_mlp": 1.04095018, + "epoch": 0.040281368364095474, + "flos": 17821101738240.0, + "grad_norm": 2.692403906654916, + "language_loss": 0.88927627, + "learning_rate": 3.99890407333324e-06, + "loss": 0.91304421, + "num_input_tokens_seen": 7116390, + "step": 335, + "time_per_iteration": 2.4892184734344482 + }, + { + "auxiliary_loss_clip": 0.01315432, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.08302426, + "balance_loss_mlp": 1.03863895, + "epoch": 0.040401611254734564, + "flos": 19573757959680.0, + "grad_norm": 1.9567657765697963, + "language_loss": 0.87152272, + "learning_rate": 3.998878137563275e-06, + "loss": 0.89524102, + "num_input_tokens_seen": 7135940, + "step": 336, + "time_per_iteration": 2.5106372833251953 + }, + { + "auxiliary_loss_clip": 0.01317852, + "auxiliary_loss_mlp": 0.0105336, + "balance_loss_clip": 1.08562124, + "balance_loss_mlp": 1.03564513, + "epoch": 0.040521854145373654, + "flos": 22054466949120.0, + "grad_norm": 1.943193987780698, + "language_loss": 0.85139, + "learning_rate": 3.998851898565085e-06, + "loss": 0.8751021, + "num_input_tokens_seen": 7155745, + "step": 337, + "time_per_iteration": 2.5173726081848145 + }, + { + "auxiliary_loss_clip": 0.01314726, + "auxiliary_loss_mlp": 0.01048657, + "balance_loss_clip": 1.08427358, + "balance_loss_mlp": 1.03190768, + "epoch": 0.04064209703601274, + "flos": 22674644196480.0, + "grad_norm": 2.1707250638736135, + "language_loss": 0.82978356, + "learning_rate": 3.998825356342653e-06, + "loss": 0.85341734, + "num_input_tokens_seen": 7175920, + "step": 338, + "time_per_iteration": 2.5226211547851562 + }, + { + "auxiliary_loss_clip": 0.0131692, + "auxiliary_loss_mlp": 0.01064318, + "balance_loss_clip": 1.0839572, + "balance_loss_mlp": 1.04700851, + "epoch": 0.04076233992665183, + "flos": 38582172783360.0, + "grad_norm": 2.348032330359331, + "language_loss": 0.73048615, + "learning_rate": 3.998798510900003e-06, + "loss": 0.75429857, + "num_input_tokens_seen": 7198720, + "step": 339, + "time_per_iteration": 2.6424903869628906 + }, + { + "auxiliary_loss_clip": 0.01317186, + "auxiliary_loss_mlp": 0.01053957, + "balance_loss_clip": 1.08507109, + "balance_loss_mlp": 1.03671885, + "epoch": 0.04088258281729093, + "flos": 25885309374720.0, + "grad_norm": 4.171316213342774, + "language_loss": 0.83692461, + "learning_rate": 3.998771362241207e-06, + "loss": 0.86063612, + "num_input_tokens_seen": 7219125, + "step": 340, + "time_per_iteration": 2.5377330780029297 + }, + { + "auxiliary_loss_clip": 0.01311549, + "auxiliary_loss_mlp": 0.01054975, + "balance_loss_clip": 1.08336401, + "balance_loss_mlp": 1.03810692, + "epoch": 0.04100282570793002, + "flos": 19789832223360.0, + "grad_norm": 1.8624021442822696, + "language_loss": 0.876616, + "learning_rate": 3.998743910370385e-06, + "loss": 0.90028119, + "num_input_tokens_seen": 7237985, + "step": 341, + "time_per_iteration": 3.3244218826293945 + }, + { + "auxiliary_loss_clip": 0.01322695, + "auxiliary_loss_mlp": 0.01049569, + "balance_loss_clip": 1.09355974, + "balance_loss_mlp": 1.03086543, + "epoch": 0.04112306859856911, + "flos": 22565152563840.0, + "grad_norm": 2.0866266673048233, + "language_loss": 0.73385412, + "learning_rate": 3.998716155291702e-06, + "loss": 0.75757676, + "num_input_tokens_seen": 7255825, + "step": 342, + "time_per_iteration": 3.295494556427002 + }, + { + "auxiliary_loss_clip": 0.01316787, + "auxiliary_loss_mlp": 0.0105937, + "balance_loss_clip": 1.08823419, + "balance_loss_mlp": 1.04179811, + "epoch": 0.0412433114892082, + "flos": 25040654081280.0, + "grad_norm": 1.7813420202434957, + "language_loss": 0.90508056, + "learning_rate": 3.998688097009366e-06, + "loss": 0.92884207, + "num_input_tokens_seen": 7276590, + "step": 343, + "time_per_iteration": 3.392993688583374 + }, + { + "auxiliary_loss_clip": 0.01317468, + "auxiliary_loss_mlp": 0.01054606, + "balance_loss_clip": 1.08683801, + "balance_loss_mlp": 1.03829789, + "epoch": 0.04136355437984729, + "flos": 25191371548800.0, + "grad_norm": 2.13551725510914, + "language_loss": 0.80172688, + "learning_rate": 3.998659735527636e-06, + "loss": 0.82544762, + "num_input_tokens_seen": 7295680, + "step": 344, + "time_per_iteration": 3.27579665184021 + }, + { + "auxiliary_loss_clip": 0.01314617, + "auxiliary_loss_mlp": 0.0105403, + "balance_loss_clip": 1.08498883, + "balance_loss_mlp": 1.03670919, + "epoch": 0.04148379727048638, + "flos": 22966777509120.0, + "grad_norm": 1.8327557282777152, + "language_loss": 0.77637661, + "learning_rate": 3.998631070850813e-06, + "loss": 0.80006313, + "num_input_tokens_seen": 7316300, + "step": 345, + "time_per_iteration": 2.50992488861084 + }, + { + "auxiliary_loss_clip": 0.01312904, + "auxiliary_loss_mlp": 0.01064336, + "balance_loss_clip": 1.08687079, + "balance_loss_mlp": 1.04860044, + "epoch": 0.041604040161125476, + "flos": 14063481187200.0, + "grad_norm": 2.960064737292621, + "language_loss": 0.83638465, + "learning_rate": 3.9986021029832455e-06, + "loss": 0.86015701, + "num_input_tokens_seen": 7333615, + "step": 346, + "time_per_iteration": 2.4711108207702637 + }, + { + "auxiliary_loss_clip": 0.01312436, + "auxiliary_loss_mlp": 0.01055771, + "balance_loss_clip": 1.08320427, + "balance_loss_mlp": 1.03686416, + "epoch": 0.041724283051764566, + "flos": 12091877614080.0, + "grad_norm": 4.551994444278132, + "language_loss": 0.91755617, + "learning_rate": 3.9985728319293285e-06, + "loss": 0.94123816, + "num_input_tokens_seen": 7347590, + "step": 347, + "time_per_iteration": 2.4509387016296387 + }, + { + "auxiliary_loss_clip": 0.01317267, + "auxiliary_loss_mlp": 0.01054278, + "balance_loss_clip": 1.08412385, + "balance_loss_mlp": 1.03663468, + "epoch": 0.041844525942403656, + "flos": 12385303816320.0, + "grad_norm": 2.3224384490317864, + "language_loss": 0.8525244, + "learning_rate": 3.998543257693501e-06, + "loss": 0.8762399, + "num_input_tokens_seen": 7364345, + "step": 348, + "time_per_iteration": 2.4601216316223145 + }, + { + "auxiliary_loss_clip": 0.01314363, + "auxiliary_loss_mlp": 0.01065657, + "balance_loss_clip": 1.08635926, + "balance_loss_mlp": 1.0492177, + "epoch": 0.041964768833042745, + "flos": 23769345041280.0, + "grad_norm": 2.248021269233625, + "language_loss": 0.87798762, + "learning_rate": 3.998513380280251e-06, + "loss": 0.90178782, + "num_input_tokens_seen": 7384625, + "step": 349, + "time_per_iteration": 2.522496461868286 + }, + { + "auxiliary_loss_clip": 0.01317368, + "auxiliary_loss_mlp": 0.01069265, + "balance_loss_clip": 1.08583426, + "balance_loss_mlp": 1.05029845, + "epoch": 0.042085011723681835, + "flos": 11875336473600.0, + "grad_norm": 2.8138389743529975, + "language_loss": 0.95031047, + "learning_rate": 3.99848319969411e-06, + "loss": 0.97417688, + "num_input_tokens_seen": 7402225, + "step": 350, + "time_per_iteration": 2.458815097808838 + }, + { + "auxiliary_loss_clip": 0.01319429, + "auxiliary_loss_mlp": 0.01063121, + "balance_loss_clip": 1.08800197, + "balance_loss_mlp": 1.04487002, + "epoch": 0.042205254614320925, + "flos": 16873957964160.0, + "grad_norm": 2.282321927813428, + "language_loss": 0.79577434, + "learning_rate": 3.9984527159396564e-06, + "loss": 0.81959981, + "num_input_tokens_seen": 7420865, + "step": 351, + "time_per_iteration": 2.467747449874878 + }, + { + "auxiliary_loss_clip": 0.01313021, + "auxiliary_loss_mlp": 0.01055937, + "balance_loss_clip": 1.08246279, + "balance_loss_mlp": 1.03935456, + "epoch": 0.04232549750496002, + "flos": 25118508810240.0, + "grad_norm": 2.2321062629986983, + "language_loss": 0.84833091, + "learning_rate": 3.9984219290215154e-06, + "loss": 0.87202048, + "num_input_tokens_seen": 7441040, + "step": 352, + "time_per_iteration": 2.5033609867095947 + }, + { + "auxiliary_loss_clip": 0.01311988, + "auxiliary_loss_mlp": 0.01048693, + "balance_loss_clip": 1.08593845, + "balance_loss_mlp": 1.03308797, + "epoch": 0.04244574039559911, + "flos": 26724541714560.0, + "grad_norm": 3.782901108407086, + "language_loss": 0.89161533, + "learning_rate": 3.998390838944356e-06, + "loss": 0.91522217, + "num_input_tokens_seen": 7462545, + "step": 353, + "time_per_iteration": 2.5348618030548096 + }, + { + "auxiliary_loss_clip": 0.01313427, + "auxiliary_loss_mlp": 0.01060902, + "balance_loss_clip": 1.08533001, + "balance_loss_mlp": 1.04473722, + "epoch": 0.0425659832862382, + "flos": 20923244951040.0, + "grad_norm": 2.1331455820739187, + "language_loss": 0.90390348, + "learning_rate": 3.998359445712895e-06, + "loss": 0.92764676, + "num_input_tokens_seen": 7481650, + "step": 354, + "time_per_iteration": 2.5024843215942383 + }, + { + "auxiliary_loss_clip": 0.01310854, + "auxiliary_loss_mlp": 0.01050327, + "balance_loss_clip": 1.08206868, + "balance_loss_mlp": 1.03448427, + "epoch": 0.04268622617687729, + "flos": 23331127115520.0, + "grad_norm": 2.2933715559087076, + "language_loss": 0.81207919, + "learning_rate": 3.9983277493318955e-06, + "loss": 0.83569098, + "num_input_tokens_seen": 7500945, + "step": 355, + "time_per_iteration": 2.5418589115142822 + }, + { + "auxiliary_loss_clip": 0.01314101, + "auxiliary_loss_mlp": 0.01053424, + "balance_loss_clip": 1.08197987, + "balance_loss_mlp": 1.03690147, + "epoch": 0.04280646906751638, + "flos": 25994010908160.0, + "grad_norm": 1.6714723407154795, + "language_loss": 0.81337923, + "learning_rate": 3.998295749806165e-06, + "loss": 0.83705449, + "num_input_tokens_seen": 7522170, + "step": 356, + "time_per_iteration": 2.523563861846924 + }, + { + "auxiliary_loss_clip": 0.01314028, + "auxiliary_loss_mlp": 0.01068793, + "balance_loss_clip": 1.08721352, + "balance_loss_mlp": 1.05179369, + "epoch": 0.04292671195815547, + "flos": 26906824258560.0, + "grad_norm": 1.8169540640186919, + "language_loss": 0.83275414, + "learning_rate": 3.998263447140558e-06, + "loss": 0.85658234, + "num_input_tokens_seen": 7542370, + "step": 357, + "time_per_iteration": 2.538599729537964 + }, + { + "auxiliary_loss_clip": 0.01309809, + "auxiliary_loss_mlp": 0.0104728, + "balance_loss_clip": 1.08093023, + "balance_loss_mlp": 1.03097248, + "epoch": 0.04304695484879457, + "flos": 39457315745280.0, + "grad_norm": 1.753371143794966, + "language_loss": 0.81925076, + "learning_rate": 3.998230841339976e-06, + "loss": 0.84282172, + "num_input_tokens_seen": 7564380, + "step": 358, + "time_per_iteration": 2.637113332748413 + }, + { + "auxiliary_loss_clip": 0.01309956, + "auxiliary_loss_mlp": 0.01051402, + "balance_loss_clip": 1.08537078, + "balance_loss_mlp": 1.03527308, + "epoch": 0.04316719773943366, + "flos": 19646297475840.0, + "grad_norm": 2.2657939925781188, + "language_loss": 0.84906995, + "learning_rate": 3.998197932409363e-06, + "loss": 0.87268353, + "num_input_tokens_seen": 7582390, + "step": 359, + "time_per_iteration": 2.501049041748047 + }, + { + "auxiliary_loss_clip": 0.01303976, + "auxiliary_loss_mlp": 0.01058092, + "balance_loss_clip": 1.08091342, + "balance_loss_mlp": 1.04223669, + "epoch": 0.04328744063007275, + "flos": 22452320966400.0, + "grad_norm": 2.193100691262511, + "language_loss": 0.86434275, + "learning_rate": 3.9981647203537125e-06, + "loss": 0.88796341, + "num_input_tokens_seen": 7599890, + "step": 360, + "time_per_iteration": 2.4809176921844482 + }, + { + "auxiliary_loss_clip": 0.01307359, + "auxiliary_loss_mlp": 0.01062598, + "balance_loss_clip": 1.08094239, + "balance_loss_mlp": 1.04727912, + "epoch": 0.04340768352071184, + "flos": 21283033530240.0, + "grad_norm": 2.0729731843907726, + "language_loss": 0.96005911, + "learning_rate": 3.998131205178063e-06, + "loss": 0.98375863, + "num_input_tokens_seen": 7618360, + "step": 361, + "time_per_iteration": 2.465364694595337 + }, + { + "auxiliary_loss_clip": 0.01307294, + "auxiliary_loss_mlp": 0.01057581, + "balance_loss_clip": 1.08154821, + "balance_loss_mlp": 1.04179764, + "epoch": 0.04352792641135093, + "flos": 11583705951360.0, + "grad_norm": 4.862272840517238, + "language_loss": 0.76420879, + "learning_rate": 3.998097386887498e-06, + "loss": 0.78785759, + "num_input_tokens_seen": 7635435, + "step": 362, + "time_per_iteration": 2.4743475914001465 + }, + { + "auxiliary_loss_clip": 0.01304918, + "auxiliary_loss_mlp": 0.01067394, + "balance_loss_clip": 1.08116436, + "balance_loss_mlp": 1.05105066, + "epoch": 0.04364816930199002, + "flos": 23623547736960.0, + "grad_norm": 1.9716536780055414, + "language_loss": 0.85103494, + "learning_rate": 3.998063265487148e-06, + "loss": 0.87475806, + "num_input_tokens_seen": 7656485, + "step": 363, + "time_per_iteration": 2.5508499145507812 + }, + { + "auxiliary_loss_clip": 0.01308042, + "auxiliary_loss_mlp": 0.01056062, + "balance_loss_clip": 1.08364844, + "balance_loss_mlp": 1.04043388, + "epoch": 0.043768412192629114, + "flos": 14429734214400.0, + "grad_norm": 2.209335022683545, + "language_loss": 0.80973411, + "learning_rate": 3.99802884098219e-06, + "loss": 0.8333751, + "num_input_tokens_seen": 7674595, + "step": 364, + "time_per_iteration": 2.4609665870666504 + }, + { + "auxiliary_loss_clip": 0.01307313, + "auxiliary_loss_mlp": 0.01047458, + "balance_loss_clip": 1.08088434, + "balance_loss_mlp": 1.03095937, + "epoch": 0.043888655083268203, + "flos": 26468893641600.0, + "grad_norm": 2.1858082866538378, + "language_loss": 0.82345247, + "learning_rate": 3.997994113377845e-06, + "loss": 0.84700024, + "num_input_tokens_seen": 7693495, + "step": 365, + "time_per_iteration": 2.530149459838867 + }, + { + "auxiliary_loss_clip": 0.01306787, + "auxiliary_loss_mlp": 0.01048853, + "balance_loss_clip": 1.08145404, + "balance_loss_mlp": 1.03215146, + "epoch": 0.04400889797390729, + "flos": 27235263242880.0, + "grad_norm": 2.3087204289123235, + "language_loss": 0.83136582, + "learning_rate": 3.9979590826793815e-06, + "loss": 0.85492224, + "num_input_tokens_seen": 7714685, + "step": 366, + "time_per_iteration": 2.5117695331573486 + }, + { + "auxiliary_loss_clip": 0.01310481, + "auxiliary_loss_mlp": 0.01053061, + "balance_loss_clip": 1.08402312, + "balance_loss_mlp": 1.03628802, + "epoch": 0.04412914086454638, + "flos": 20119528183680.0, + "grad_norm": 2.37421501344759, + "language_loss": 0.80873638, + "learning_rate": 3.997923748892113e-06, + "loss": 0.83237183, + "num_input_tokens_seen": 7734005, + "step": 367, + "time_per_iteration": 3.252614974975586 + }, + { + "auxiliary_loss_clip": 0.01305054, + "auxiliary_loss_mlp": 0.01051596, + "balance_loss_clip": 1.08323932, + "balance_loss_mlp": 1.03655171, + "epoch": 0.04424938375518547, + "flos": 22604618632320.0, + "grad_norm": 2.060065623354843, + "language_loss": 0.88556302, + "learning_rate": 3.9978881120214015e-06, + "loss": 0.90912956, + "num_input_tokens_seen": 7755525, + "step": 368, + "time_per_iteration": 2.515429973602295 + }, + { + "auxiliary_loss_clip": 0.01307272, + "auxiliary_loss_mlp": 0.01050379, + "balance_loss_clip": 1.08134317, + "balance_loss_mlp": 1.03366542, + "epoch": 0.04436962664582456, + "flos": 24132365844480.0, + "grad_norm": 1.9910126897436693, + "language_loss": 0.79411018, + "learning_rate": 3.997852172072652e-06, + "loss": 0.81768668, + "num_input_tokens_seen": 7776740, + "step": 369, + "time_per_iteration": 2.5036027431488037 + }, + { + "auxiliary_loss_clip": 0.0130762, + "auxiliary_loss_mlp": 0.01061772, + "balance_loss_clip": 1.08186865, + "balance_loss_mlp": 1.04572678, + "epoch": 0.04448986953646366, + "flos": 18222906251520.0, + "grad_norm": 4.881557466520376, + "language_loss": 0.8943032, + "learning_rate": 3.9978159290513155e-06, + "loss": 0.91799712, + "num_input_tokens_seen": 7794820, + "step": 370, + "time_per_iteration": 3.3193178176879883 + }, + { + "auxiliary_loss_clip": 0.01307958, + "auxiliary_loss_mlp": 0.01065044, + "balance_loss_clip": 1.0822525, + "balance_loss_mlp": 1.04829526, + "epoch": 0.04461011242710275, + "flos": 30117920400000.0, + "grad_norm": 1.6630904649713403, + "language_loss": 0.80331135, + "learning_rate": 3.997779382962892e-06, + "loss": 0.82704139, + "num_input_tokens_seen": 7817705, + "step": 371, + "time_per_iteration": 2.547529458999634 + }, + { + "auxiliary_loss_clip": 0.0130169, + "auxiliary_loss_mlp": 0.01055319, + "balance_loss_clip": 1.0793817, + "balance_loss_mlp": 1.03841484, + "epoch": 0.04473035531774184, + "flos": 29752529299200.0, + "grad_norm": 2.1486411377727923, + "language_loss": 0.73549217, + "learning_rate": 3.997742533812924e-06, + "loss": 0.75906223, + "num_input_tokens_seen": 7840970, + "step": 372, + "time_per_iteration": 3.419358730316162 + }, + { + "auxiliary_loss_clip": 0.01307919, + "auxiliary_loss_mlp": 0.01067117, + "balance_loss_clip": 1.08419704, + "balance_loss_mlp": 1.05107129, + "epoch": 0.04485059820838093, + "flos": 13151565676800.0, + "grad_norm": 2.450927137963643, + "language_loss": 0.92658639, + "learning_rate": 3.997705381607001e-06, + "loss": 0.95033681, + "num_input_tokens_seen": 7857785, + "step": 373, + "time_per_iteration": 2.4556984901428223 + }, + { + "auxiliary_loss_clip": 0.01213861, + "auxiliary_loss_mlp": 0.01050382, + "balance_loss_clip": 1.06126595, + "balance_loss_mlp": 1.04227602, + "epoch": 0.04497084109902002, + "flos": 68094209548800.0, + "grad_norm": 0.9790704418458376, + "language_loss": 0.60295194, + "learning_rate": 3.997667926350761e-06, + "loss": 0.62559438, + "num_input_tokens_seen": 7916115, + "step": 374, + "time_per_iteration": 2.9758031368255615 + }, + { + "auxiliary_loss_clip": 0.01212173, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_clip": 1.06006777, + "balance_loss_mlp": 1.036129, + "epoch": 0.04509108398965911, + "flos": 64342263346560.0, + "grad_norm": 0.9080632105557515, + "language_loss": 0.57768965, + "learning_rate": 3.997630168049886e-06, + "loss": 0.60025764, + "num_input_tokens_seen": 7974480, + "step": 375, + "time_per_iteration": 3.054396629333496 + }, + { + "auxiliary_loss_clip": 0.01308536, + "auxiliary_loss_mlp": 0.01060521, + "balance_loss_clip": 1.08260226, + "balance_loss_mlp": 1.04392719, + "epoch": 0.045211326880298205, + "flos": 22271115830400.0, + "grad_norm": 1.8107792463460122, + "language_loss": 0.77470303, + "learning_rate": 3.997592106710101e-06, + "loss": 0.79839367, + "num_input_tokens_seen": 7993940, + "step": 376, + "time_per_iteration": 2.5062403678894043 + }, + { + "auxiliary_loss_clip": 0.01301868, + "auxiliary_loss_mlp": 0.01051322, + "balance_loss_clip": 1.08013248, + "balance_loss_mlp": 1.03538346, + "epoch": 0.045331569770937295, + "flos": 32159441796480.0, + "grad_norm": 2.8265780724162384, + "language_loss": 0.65737206, + "learning_rate": 3.997553742337182e-06, + "loss": 0.68090397, + "num_input_tokens_seen": 8013365, + "step": 377, + "time_per_iteration": 2.5672574043273926 + }, + { + "auxiliary_loss_clip": 0.01304414, + "auxiliary_loss_mlp": 0.01055704, + "balance_loss_clip": 1.08102703, + "balance_loss_mlp": 1.03939605, + "epoch": 0.045451812661576385, + "flos": 22163455791360.0, + "grad_norm": 1.8365341766960068, + "language_loss": 0.912081, + "learning_rate": 3.997515074936949e-06, + "loss": 0.93568218, + "num_input_tokens_seen": 8034240, + "step": 378, + "time_per_iteration": 2.5110716819763184 + }, + { + "auxiliary_loss_clip": 0.01304302, + "auxiliary_loss_mlp": 0.01059454, + "balance_loss_clip": 1.0807929, + "balance_loss_mlp": 1.0435034, + "epoch": 0.045572055552215475, + "flos": 16581968305920.0, + "grad_norm": 3.127611100388022, + "language_loss": 0.86878395, + "learning_rate": 3.997476104515268e-06, + "loss": 0.89242154, + "num_input_tokens_seen": 8052430, + "step": 379, + "time_per_iteration": 2.490489959716797 + }, + { + "auxiliary_loss_clip": 0.01303696, + "auxiliary_loss_mlp": 0.01058352, + "balance_loss_clip": 1.08355606, + "balance_loss_mlp": 1.04277122, + "epoch": 0.045692298442854565, + "flos": 17603375448960.0, + "grad_norm": 3.5862771205039743, + "language_loss": 0.77378345, + "learning_rate": 3.9974368310780485e-06, + "loss": 0.79740387, + "num_input_tokens_seen": 8069605, + "step": 380, + "time_per_iteration": 2.4650542736053467 + }, + { + "auxiliary_loss_clip": 0.01313976, + "auxiliary_loss_mlp": 0.01059389, + "balance_loss_clip": 1.0852859, + "balance_loss_mlp": 1.04187727, + "epoch": 0.045812541333493655, + "flos": 26761098781440.0, + "grad_norm": 2.969493865473316, + "language_loss": 0.74292934, + "learning_rate": 3.997397254631251e-06, + "loss": 0.76666296, + "num_input_tokens_seen": 8090225, + "step": 381, + "time_per_iteration": 2.5366361141204834 + }, + { + "auxiliary_loss_clip": 0.01201911, + "auxiliary_loss_mlp": 0.01041464, + "balance_loss_clip": 1.05471504, + "balance_loss_mlp": 1.03407311, + "epoch": 0.04593278422413275, + "flos": 60250349894400.0, + "grad_norm": 0.826749808815751, + "language_loss": 0.60024762, + "learning_rate": 3.997357375180878e-06, + "loss": 0.62268138, + "num_input_tokens_seen": 8154505, + "step": 382, + "time_per_iteration": 3.1870150566101074 + }, + { + "auxiliary_loss_clip": 0.01308093, + "auxiliary_loss_mlp": 0.01050148, + "balance_loss_clip": 1.08306384, + "balance_loss_mlp": 1.03308892, + "epoch": 0.04605302711477184, + "flos": 21799249839360.0, + "grad_norm": 1.9575818561244382, + "language_loss": 0.75227451, + "learning_rate": 3.997317192732979e-06, + "loss": 0.77585691, + "num_input_tokens_seen": 8173285, + "step": 383, + "time_per_iteration": 2.5041611194610596 + }, + { + "auxiliary_loss_clip": 0.01307153, + "auxiliary_loss_mlp": 0.01062596, + "balance_loss_clip": 1.08269322, + "balance_loss_mlp": 1.04579926, + "epoch": 0.04617327000541093, + "flos": 19459705299840.0, + "grad_norm": 15.082697588225773, + "language_loss": 0.82512033, + "learning_rate": 3.99727670729365e-06, + "loss": 0.84881777, + "num_input_tokens_seen": 8191845, + "step": 384, + "time_per_iteration": 2.483057975769043 + }, + { + "auxiliary_loss_clip": 0.01305237, + "auxiliary_loss_mlp": 0.01059073, + "balance_loss_clip": 1.08576119, + "balance_loss_mlp": 1.0434922, + "epoch": 0.04629351289605002, + "flos": 25411468135680.0, + "grad_norm": 1.8832234573273792, + "language_loss": 0.77701378, + "learning_rate": 3.997235918869033e-06, + "loss": 0.80065686, + "num_input_tokens_seen": 8212880, + "step": 385, + "time_per_iteration": 2.584770679473877 + }, + { + "auxiliary_loss_clip": 0.01307316, + "auxiliary_loss_mlp": 0.0104868, + "balance_loss_clip": 1.08620346, + "balance_loss_mlp": 1.03336191, + "epoch": 0.04641375578668911, + "flos": 20558284813440.0, + "grad_norm": 2.1588079040337074, + "language_loss": 0.82640243, + "learning_rate": 3.997194827465315e-06, + "loss": 0.84996235, + "num_input_tokens_seen": 8231475, + "step": 386, + "time_per_iteration": 2.5023181438446045 + }, + { + "auxiliary_loss_clip": 0.01305475, + "auxiliary_loss_mlp": 0.01048771, + "balance_loss_clip": 1.08197701, + "balance_loss_mlp": 1.03318989, + "epoch": 0.0465339986773282, + "flos": 13188661447680.0, + "grad_norm": 3.255443470714345, + "language_loss": 0.91170299, + "learning_rate": 3.997153433088728e-06, + "loss": 0.93524545, + "num_input_tokens_seen": 8248600, + "step": 387, + "time_per_iteration": 2.462891101837158 + }, + { + "auxiliary_loss_clip": 0.01306422, + "auxiliary_loss_mlp": 0.01054352, + "balance_loss_clip": 1.08443904, + "balance_loss_mlp": 1.03742421, + "epoch": 0.0466542415679673, + "flos": 25556547168000.0, + "grad_norm": 2.0165489000311903, + "language_loss": 0.8136313, + "learning_rate": 3.997111735745554e-06, + "loss": 0.83723903, + "num_input_tokens_seen": 8271570, + "step": 388, + "time_per_iteration": 2.5356483459472656 + }, + { + "auxiliary_loss_clip": 0.01302901, + "auxiliary_loss_mlp": 0.01060489, + "balance_loss_clip": 1.08219302, + "balance_loss_mlp": 1.0427146, + "epoch": 0.04677448445860639, + "flos": 22236749493120.0, + "grad_norm": 1.849563944640564, + "language_loss": 0.82464463, + "learning_rate": 3.997069735442118e-06, + "loss": 0.84827852, + "num_input_tokens_seen": 8291265, + "step": 389, + "time_per_iteration": 2.526449680328369 + }, + { + "auxiliary_loss_clip": 0.0130254, + "auxiliary_loss_mlp": 0.0105639, + "balance_loss_clip": 1.08192551, + "balance_loss_mlp": 1.04064286, + "epoch": 0.04689472734924548, + "flos": 28147825198080.0, + "grad_norm": 1.4598805329766036, + "language_loss": 0.80379522, + "learning_rate": 3.997027432184792e-06, + "loss": 0.82738447, + "num_input_tokens_seen": 8315925, + "step": 390, + "time_per_iteration": 2.6069769859313965 + }, + { + "auxiliary_loss_clip": 0.0130467, + "auxiliary_loss_mlp": 0.010543, + "balance_loss_clip": 1.08359206, + "balance_loss_mlp": 1.03882658, + "epoch": 0.04701497023988457, + "flos": 23148952312320.0, + "grad_norm": 2.013483234248443, + "language_loss": 0.89315629, + "learning_rate": 3.99698482597999e-06, + "loss": 0.91674602, + "num_input_tokens_seen": 8333605, + "step": 391, + "time_per_iteration": 2.4999606609344482 + }, + { + "auxiliary_loss_clip": 0.01194121, + "auxiliary_loss_mlp": 0.01014802, + "balance_loss_clip": 1.05286396, + "balance_loss_mlp": 1.00836456, + "epoch": 0.04713521313052366, + "flos": 64827668764800.0, + "grad_norm": 0.8743215866590295, + "language_loss": 0.63898301, + "learning_rate": 3.99694191683418e-06, + "loss": 0.66107219, + "num_input_tokens_seen": 8394405, + "step": 392, + "time_per_iteration": 3.077955484390259 + }, + { + "auxiliary_loss_clip": 0.0130794, + "auxiliary_loss_mlp": 0.01055877, + "balance_loss_clip": 1.08684731, + "balance_loss_mlp": 1.03896117, + "epoch": 0.047255456021162746, + "flos": 18771585477120.0, + "grad_norm": 1.9766125883336993, + "language_loss": 0.81709909, + "learning_rate": 3.996898704753867e-06, + "loss": 0.84073722, + "num_input_tokens_seen": 8412355, + "step": 393, + "time_per_iteration": 2.467668056488037 + }, + { + "auxiliary_loss_clip": 0.01301119, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_clip": 1.0808208, + "balance_loss_mlp": 1.03555799, + "epoch": 0.04737569891180184, + "flos": 22053820504320.0, + "grad_norm": 2.1282780018454623, + "language_loss": 0.87553906, + "learning_rate": 3.996855189745609e-06, + "loss": 0.89906299, + "num_input_tokens_seen": 8431620, + "step": 394, + "time_per_iteration": 3.3214595317840576 + }, + { + "auxiliary_loss_clip": 0.01301429, + "auxiliary_loss_mlp": 0.01058369, + "balance_loss_clip": 1.08062744, + "balance_loss_mlp": 1.04173899, + "epoch": 0.04749594180244093, + "flos": 29057370410880.0, + "grad_norm": 1.7742712768962188, + "language_loss": 0.92587304, + "learning_rate": 3.996811371816007e-06, + "loss": 0.949471, + "num_input_tokens_seen": 8454045, + "step": 395, + "time_per_iteration": 2.5340468883514404 + }, + { + "auxiliary_loss_clip": 0.01304723, + "auxiliary_loss_mlp": 0.01062, + "balance_loss_clip": 1.08485949, + "balance_loss_mlp": 1.04672945, + "epoch": 0.04761618469308002, + "flos": 35112268172160.0, + "grad_norm": 1.9140049090450078, + "language_loss": 0.78198045, + "learning_rate": 3.996767250971707e-06, + "loss": 0.80564767, + "num_input_tokens_seen": 8476785, + "step": 396, + "time_per_iteration": 2.604710340499878 + }, + { + "auxiliary_loss_clip": 0.01306915, + "auxiliary_loss_mlp": 0.01052324, + "balance_loss_clip": 1.08561552, + "balance_loss_mlp": 1.03587341, + "epoch": 0.04773642758371911, + "flos": 25630702796160.0, + "grad_norm": 1.8965995587752826, + "language_loss": 0.86975163, + "learning_rate": 3.996722827219403e-06, + "loss": 0.89334404, + "num_input_tokens_seen": 8498400, + "step": 397, + "time_per_iteration": 3.320405960083008 + }, + { + "auxiliary_loss_clip": 0.01310238, + "auxiliary_loss_mlp": 0.01061852, + "balance_loss_clip": 1.08772373, + "balance_loss_mlp": 1.0454371, + "epoch": 0.0478566704743582, + "flos": 20631506688000.0, + "grad_norm": 2.7339659543636303, + "language_loss": 0.82932961, + "learning_rate": 3.996678100565833e-06, + "loss": 0.85305053, + "num_input_tokens_seen": 8517455, + "step": 398, + "time_per_iteration": 2.5041348934173584 + }, + { + "auxiliary_loss_clip": 0.0129915, + "auxiliary_loss_mlp": 0.01055837, + "balance_loss_clip": 1.08027911, + "balance_loss_mlp": 1.03838503, + "epoch": 0.04797691336499729, + "flos": 18835721210880.0, + "grad_norm": 2.2288814645322828, + "language_loss": 0.88556743, + "learning_rate": 3.996633071017783e-06, + "loss": 0.90911722, + "num_input_tokens_seen": 8534085, + "step": 399, + "time_per_iteration": 4.04193902015686 + }, + { + "auxiliary_loss_clip": 0.01300727, + "auxiliary_loss_mlp": 0.01055607, + "balance_loss_clip": 1.08265805, + "balance_loss_mlp": 1.0391798, + "epoch": 0.04809715625563638, + "flos": 21099673578240.0, + "grad_norm": 2.366517592869167, + "language_loss": 0.81760651, + "learning_rate": 3.996587738582084e-06, + "loss": 0.84116983, + "num_input_tokens_seen": 8550885, + "step": 400, + "time_per_iteration": 2.4908688068389893 + }, + { + "auxiliary_loss_clip": 0.01298287, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_clip": 1.07882285, + "balance_loss_mlp": 1.03060579, + "epoch": 0.04821739914627548, + "flos": 23805650712960.0, + "grad_norm": 13.411460199123866, + "language_loss": 0.86008286, + "learning_rate": 3.9965421032656115e-06, + "loss": 0.88352466, + "num_input_tokens_seen": 8570815, + "step": 401, + "time_per_iteration": 2.5356838703155518 + }, + { + "auxiliary_loss_clip": 0.0130131, + "auxiliary_loss_mlp": 0.01051442, + "balance_loss_clip": 1.08075023, + "balance_loss_mlp": 1.03390598, + "epoch": 0.04833764203691457, + "flos": 22200587475840.0, + "grad_norm": 2.7015538418090874, + "language_loss": 0.94228053, + "learning_rate": 3.99649616507529e-06, + "loss": 0.96580803, + "num_input_tokens_seen": 8589910, + "step": 402, + "time_per_iteration": 2.515148639678955 + }, + { + "auxiliary_loss_clip": 0.01191461, + "auxiliary_loss_mlp": 0.01014412, + "balance_loss_clip": 1.05255461, + "balance_loss_mlp": 1.00852287, + "epoch": 0.04845788492755366, + "flos": 65904376896000.0, + "grad_norm": 0.8948925615385385, + "language_loss": 0.63133913, + "learning_rate": 3.996449924018088e-06, + "loss": 0.65339786, + "num_input_tokens_seen": 8650370, + "step": 403, + "time_per_iteration": 3.001668930053711 + }, + { + "auxiliary_loss_clip": 0.01297493, + "auxiliary_loss_mlp": 0.01055731, + "balance_loss_clip": 1.08124352, + "balance_loss_mlp": 1.04083014, + "epoch": 0.04857812781819275, + "flos": 19281301424640.0, + "grad_norm": 3.3128949042104976, + "language_loss": 0.79501402, + "learning_rate": 3.99640338010102e-06, + "loss": 0.81854618, + "num_input_tokens_seen": 8669475, + "step": 404, + "time_per_iteration": 2.5077438354492188 + }, + { + "auxiliary_loss_clip": 0.01296881, + "auxiliary_loss_mlp": 0.01050455, + "balance_loss_clip": 1.07876885, + "balance_loss_mlp": 1.03388453, + "epoch": 0.04869837070883184, + "flos": 24062376193920.0, + "grad_norm": 1.8264005464572708, + "language_loss": 0.78732586, + "learning_rate": 3.996356533331146e-06, + "loss": 0.81079918, + "num_input_tokens_seen": 8691345, + "step": 405, + "time_per_iteration": 2.5813164710998535 + }, + { + "auxiliary_loss_clip": 0.01309015, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.0822556, + "balance_loss_mlp": 1.03093433, + "epoch": 0.04881861359947093, + "flos": 25187169657600.0, + "grad_norm": 2.299877226344788, + "language_loss": 0.61913782, + "learning_rate": 3.996309383715573e-06, + "loss": 0.64269817, + "num_input_tokens_seen": 8710125, + "step": 406, + "time_per_iteration": 2.5506973266601562 + }, + { + "auxiliary_loss_clip": 0.01305136, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_clip": 1.08353484, + "balance_loss_mlp": 1.02932703, + "epoch": 0.048938856490110025, + "flos": 16362913213440.0, + "grad_norm": 4.542308600645671, + "language_loss": 0.73766047, + "learning_rate": 3.996261931261454e-06, + "loss": 0.76116097, + "num_input_tokens_seen": 8728705, + "step": 407, + "time_per_iteration": 2.4768905639648438 + }, + { + "auxiliary_loss_clip": 0.01301929, + "auxiliary_loss_mlp": 0.01050798, + "balance_loss_clip": 1.08295596, + "balance_loss_mlp": 1.03481197, + "epoch": 0.049059099380749115, + "flos": 29895094379520.0, + "grad_norm": 1.69931226147769, + "language_loss": 0.86397779, + "learning_rate": 3.996214175975987e-06, + "loss": 0.88750505, + "num_input_tokens_seen": 8749225, + "step": 408, + "time_per_iteration": 2.541062831878662 + }, + { + "auxiliary_loss_clip": 0.01305683, + "auxiliary_loss_mlp": 0.01056994, + "balance_loss_clip": 1.08446574, + "balance_loss_mlp": 1.04096055, + "epoch": 0.049179342271388204, + "flos": 35918858027520.0, + "grad_norm": 16.323599984219786, + "language_loss": 0.79003918, + "learning_rate": 3.996166117866417e-06, + "loss": 0.81366599, + "num_input_tokens_seen": 8771160, + "step": 409, + "time_per_iteration": 2.6045706272125244 + }, + { + "auxiliary_loss_clip": 0.01296232, + "auxiliary_loss_mlp": 0.01049345, + "balance_loss_clip": 1.07948112, + "balance_loss_mlp": 1.03385949, + "epoch": 0.049299585162027294, + "flos": 14611226659200.0, + "grad_norm": 1.9343237973452194, + "language_loss": 0.86551613, + "learning_rate": 3.996117756940035e-06, + "loss": 0.88897181, + "num_input_tokens_seen": 8787845, + "step": 410, + "time_per_iteration": 2.472079277038574 + }, + { + "auxiliary_loss_clip": 0.0130136, + "auxiliary_loss_mlp": 0.01048057, + "balance_loss_clip": 1.08308649, + "balance_loss_mlp": 1.03274441, + "epoch": 0.049419828052666384, + "flos": 19567939956480.0, + "grad_norm": 2.5200190517528114, + "language_loss": 0.97680199, + "learning_rate": 3.996069093204175e-06, + "loss": 1.00029612, + "num_input_tokens_seen": 8803805, + "step": 411, + "time_per_iteration": 2.5612008571624756 + }, + { + "auxiliary_loss_clip": 0.01307525, + "auxiliary_loss_mlp": 0.01053742, + "balance_loss_clip": 1.08575988, + "balance_loss_mlp": 1.0374223, + "epoch": 0.049540070943305474, + "flos": 13659916907520.0, + "grad_norm": 2.5097680591367753, + "language_loss": 0.88016677, + "learning_rate": 3.996020126666221e-06, + "loss": 0.90377945, + "num_input_tokens_seen": 8820785, + "step": 412, + "time_per_iteration": 2.5755655765533447 + }, + { + "auxiliary_loss_clip": 0.01299896, + "auxiliary_loss_mlp": 0.01047871, + "balance_loss_clip": 1.08220887, + "balance_loss_mlp": 1.03295815, + "epoch": 0.04966031383394457, + "flos": 21832035978240.0, + "grad_norm": 1.955007345608373, + "language_loss": 0.81952274, + "learning_rate": 3.995970857333601e-06, + "loss": 0.84300047, + "num_input_tokens_seen": 8841195, + "step": 413, + "time_per_iteration": 2.533642530441284 + }, + { + "auxiliary_loss_clip": 0.01301584, + "auxiliary_loss_mlp": 0.01051497, + "balance_loss_clip": 1.08071661, + "balance_loss_mlp": 1.03554714, + "epoch": 0.04978055672458366, + "flos": 28618793349120.0, + "grad_norm": 1.8510990305408832, + "language_loss": 0.79738045, + "learning_rate": 3.995921285213789e-06, + "loss": 0.82091129, + "num_input_tokens_seen": 8861455, + "step": 414, + "time_per_iteration": 2.533730983734131 + }, + { + "auxiliary_loss_clip": 0.01296805, + "auxiliary_loss_mlp": 0.0104877, + "balance_loss_clip": 1.08063483, + "balance_loss_mlp": 1.034024, + "epoch": 0.04990079961522275, + "flos": 19828220883840.0, + "grad_norm": 2.488698298117378, + "language_loss": 0.80567664, + "learning_rate": 3.995871410314305e-06, + "loss": 0.82913238, + "num_input_tokens_seen": 8880015, + "step": 415, + "time_per_iteration": 2.4749958515167236 + }, + { + "auxiliary_loss_clip": 0.01173741, + "auxiliary_loss_mlp": 0.01007018, + "balance_loss_clip": 1.04972339, + "balance_loss_mlp": 1.0009625, + "epoch": 0.05002104250586184, + "flos": 62735045293440.0, + "grad_norm": 0.9054442068518652, + "language_loss": 0.59635961, + "learning_rate": 3.995821232642714e-06, + "loss": 0.61816722, + "num_input_tokens_seen": 8938420, + "step": 416, + "time_per_iteration": 3.158643960952759 + }, + { + "auxiliary_loss_clip": 0.01281183, + "auxiliary_loss_mlp": 0.01051766, + "balance_loss_clip": 1.08201373, + "balance_loss_mlp": 1.03673339, + "epoch": 0.05014128539650093, + "flos": 27928518710400.0, + "grad_norm": 2.1831508310459284, + "language_loss": 0.82651401, + "learning_rate": 3.995770752206629e-06, + "loss": 0.8498435, + "num_input_tokens_seen": 8959495, + "step": 417, + "time_per_iteration": 2.61844539642334 + }, + { + "auxiliary_loss_clip": 0.01300244, + "auxiliary_loss_mlp": 0.01045056, + "balance_loss_clip": 1.08212149, + "balance_loss_mlp": 1.02893901, + "epoch": 0.05026152828714002, + "flos": 17705576620800.0, + "grad_norm": 2.3843534208319035, + "language_loss": 0.9731673, + "learning_rate": 3.995719969013709e-06, + "loss": 0.99662036, + "num_input_tokens_seen": 8976675, + "step": 418, + "time_per_iteration": 2.510394811630249 + }, + { + "auxiliary_loss_clip": 0.01264372, + "auxiliary_loss_mlp": 0.010525, + "balance_loss_clip": 1.07839465, + "balance_loss_mlp": 1.0364666, + "epoch": 0.05038177117777912, + "flos": 19133277477120.0, + "grad_norm": 3.010087020556891, + "language_loss": 0.85372448, + "learning_rate": 3.995668883071655e-06, + "loss": 0.87689316, + "num_input_tokens_seen": 8992900, + "step": 419, + "time_per_iteration": 2.5459470748901367 + }, + { + "auxiliary_loss_clip": 0.01300471, + "auxiliary_loss_mlp": 0.01051948, + "balance_loss_clip": 1.08223248, + "balance_loss_mlp": 1.03637969, + "epoch": 0.050502014068418206, + "flos": 20667704618880.0, + "grad_norm": 2.3014059850181887, + "language_loss": 0.9099651, + "learning_rate": 3.995617494388219e-06, + "loss": 0.93348932, + "num_input_tokens_seen": 9011020, + "step": 420, + "time_per_iteration": 2.4808101654052734 + }, + { + "auxiliary_loss_clip": 0.01261402, + "auxiliary_loss_mlp": 0.01044376, + "balance_loss_clip": 1.07446694, + "balance_loss_mlp": 1.02864623, + "epoch": 0.050622256959057296, + "flos": 21361103740800.0, + "grad_norm": 1.989831026765851, + "language_loss": 0.80376565, + "learning_rate": 3.995565802971196e-06, + "loss": 0.82682347, + "num_input_tokens_seen": 9030995, + "step": 421, + "time_per_iteration": 3.3927552700042725 + }, + { + "auxiliary_loss_clip": 0.01258431, + "auxiliary_loss_mlp": 0.01052933, + "balance_loss_clip": 1.07435346, + "balance_loss_mlp": 1.03863978, + "epoch": 0.050742499849696386, + "flos": 27673588909440.0, + "grad_norm": 1.925790732415257, + "language_loss": 0.67646933, + "learning_rate": 3.995513808828427e-06, + "loss": 0.69958293, + "num_input_tokens_seen": 9053790, + "step": 422, + "time_per_iteration": 2.646677255630493 + }, + { + "auxiliary_loss_clip": 0.01260708, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.07567978, + "balance_loss_mlp": 1.03291273, + "epoch": 0.050862742740335476, + "flos": 19865999013120.0, + "grad_norm": 1.8116825290558, + "language_loss": 0.76393795, + "learning_rate": 3.9954615119678e-06, + "loss": 0.78702748, + "num_input_tokens_seen": 9072345, + "step": 423, + "time_per_iteration": 2.561293363571167 + }, + { + "auxiliary_loss_clip": 0.01269033, + "auxiliary_loss_mlp": 0.01056699, + "balance_loss_clip": 1.07558346, + "balance_loss_mlp": 1.04059434, + "epoch": 0.050982985630974566, + "flos": 22085098272000.0, + "grad_norm": 2.03521907066593, + "language_loss": 0.80695105, + "learning_rate": 3.995408912397248e-06, + "loss": 0.83020842, + "num_input_tokens_seen": 9090240, + "step": 424, + "time_per_iteration": 2.5259456634521484 + }, + { + "auxiliary_loss_clip": 0.01266463, + "auxiliary_loss_mlp": 0.01051782, + "balance_loss_clip": 1.07880783, + "balance_loss_mlp": 1.03574848, + "epoch": 0.05110322852161366, + "flos": 20740962407040.0, + "grad_norm": 2.153202176186237, + "language_loss": 0.93311572, + "learning_rate": 3.99535601012475e-06, + "loss": 0.95629811, + "num_input_tokens_seen": 9105570, + "step": 425, + "time_per_iteration": 4.150558233261108 + }, + { + "auxiliary_loss_clip": 0.01245075, + "auxiliary_loss_mlp": 0.00766993, + "balance_loss_clip": 1.07676184, + "balance_loss_mlp": 1.00095129, + "epoch": 0.05122347141225275, + "flos": 28547295327360.0, + "grad_norm": 1.7456692394336903, + "language_loss": 0.75426227, + "learning_rate": 3.995302805158333e-06, + "loss": 0.77438295, + "num_input_tokens_seen": 9128225, + "step": 426, + "time_per_iteration": 2.6686456203460693 + }, + { + "auxiliary_loss_clip": 0.01255958, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.07547879, + "balance_loss_mlp": 1.03403497, + "epoch": 0.05134371430289184, + "flos": 19722679747200.0, + "grad_norm": 1.952348852847772, + "language_loss": 0.83475518, + "learning_rate": 3.9952492975060665e-06, + "loss": 0.85782886, + "num_input_tokens_seen": 9148295, + "step": 427, + "time_per_iteration": 3.3942737579345703 + }, + { + "auxiliary_loss_clip": 0.01276961, + "auxiliary_loss_mlp": 0.01041478, + "balance_loss_clip": 1.07808042, + "balance_loss_mlp": 1.02670765, + "epoch": 0.05146395719353093, + "flos": 34458945649920.0, + "grad_norm": 2.593341049968763, + "language_loss": 0.84843683, + "learning_rate": 3.995195487176067e-06, + "loss": 0.87162125, + "num_input_tokens_seen": 9168525, + "step": 428, + "time_per_iteration": 2.6617722511291504 + }, + { + "auxiliary_loss_clip": 0.01295977, + "auxiliary_loss_mlp": 0.01050163, + "balance_loss_clip": 1.07999504, + "balance_loss_mlp": 1.03482068, + "epoch": 0.05158420008417002, + "flos": 21760286561280.0, + "grad_norm": 1.9204342854519867, + "language_loss": 0.85410869, + "learning_rate": 3.995141374176499e-06, + "loss": 0.87757009, + "num_input_tokens_seen": 9186920, + "step": 429, + "time_per_iteration": 2.5337750911712646 + }, + { + "auxiliary_loss_clip": 0.01143837, + "auxiliary_loss_mlp": 0.00756563, + "balance_loss_clip": 1.04659235, + "balance_loss_mlp": 1.00036418, + "epoch": 0.05170444297480911, + "flos": 72553956226560.0, + "grad_norm": 0.8703592539281128, + "language_loss": 0.63139963, + "learning_rate": 3.995086958515572e-06, + "loss": 0.65040362, + "num_input_tokens_seen": 9244940, + "step": 430, + "time_per_iteration": 3.196502208709717 + }, + { + "auxiliary_loss_clip": 0.0118113, + "auxiliary_loss_mlp": 0.007568, + "balance_loss_clip": 1.04612827, + "balance_loss_mlp": 1.00034904, + "epoch": 0.05182468586544821, + "flos": 62416159326720.0, + "grad_norm": 0.8555098447788403, + "language_loss": 0.59934545, + "learning_rate": 3.995032240201538e-06, + "loss": 0.61872476, + "num_input_tokens_seen": 9307335, + "step": 431, + "time_per_iteration": 3.0434532165527344 + }, + { + "auxiliary_loss_clip": 0.01153987, + "auxiliary_loss_mlp": 0.01007454, + "balance_loss_clip": 1.03965926, + "balance_loss_mlp": 1.00194609, + "epoch": 0.0519449287560873, + "flos": 41225989432320.0, + "grad_norm": 0.9454745863448065, + "language_loss": 0.63146764, + "learning_rate": 3.9949772192427e-06, + "loss": 0.65308207, + "num_input_tokens_seen": 9353960, + "step": 432, + "time_per_iteration": 2.7855050563812256 + }, + { + "auxiliary_loss_clip": 0.01259463, + "auxiliary_loss_mlp": 0.01047102, + "balance_loss_clip": 1.07326114, + "balance_loss_mlp": 1.03160453, + "epoch": 0.05206517164672639, + "flos": 17494530261120.0, + "grad_norm": 1.966456443919025, + "language_loss": 0.79672104, + "learning_rate": 3.994921895647405e-06, + "loss": 0.81978667, + "num_input_tokens_seen": 9372130, + "step": 433, + "time_per_iteration": 2.5267980098724365 + }, + { + "auxiliary_loss_clip": 0.01176159, + "auxiliary_loss_mlp": 0.01007939, + "balance_loss_clip": 1.0428772, + "balance_loss_mlp": 1.00245512, + "epoch": 0.05218541453736548, + "flos": 64002762973440.0, + "grad_norm": 0.8373059307137443, + "language_loss": 0.55360562, + "learning_rate": 3.994866269424043e-06, + "loss": 0.57544661, + "num_input_tokens_seen": 9428500, + "step": 434, + "time_per_iteration": 2.9598982334136963 + }, + { + "auxiliary_loss_clip": 0.01201427, + "auxiliary_loss_mlp": 0.01053205, + "balance_loss_clip": 1.05995917, + "balance_loss_mlp": 1.0375762, + "epoch": 0.05230565742800457, + "flos": 19317319787520.0, + "grad_norm": 2.507600517771012, + "language_loss": 0.78426898, + "learning_rate": 3.9948103405810545e-06, + "loss": 0.80681527, + "num_input_tokens_seen": 9447450, + "step": 435, + "time_per_iteration": 2.6395328044891357 + }, + { + "auxiliary_loss_clip": 0.01230608, + "auxiliary_loss_mlp": 0.01053478, + "balance_loss_clip": 1.06978583, + "balance_loss_mlp": 1.03942299, + "epoch": 0.05242590031864366, + "flos": 25298636538240.0, + "grad_norm": 4.149633701889057, + "language_loss": 0.86005664, + "learning_rate": 3.994754109126923e-06, + "loss": 0.88289744, + "num_input_tokens_seen": 9468945, + "step": 436, + "time_per_iteration": 2.645141839981079 + }, + { + "auxiliary_loss_clip": 0.0120372, + "auxiliary_loss_mlp": 0.01043157, + "balance_loss_clip": 1.06939352, + "balance_loss_mlp": 1.02864957, + "epoch": 0.052546143209282754, + "flos": 26211629456640.0, + "grad_norm": 1.6682583084824458, + "language_loss": 0.93285549, + "learning_rate": 3.994697575070181e-06, + "loss": 0.95532429, + "num_input_tokens_seen": 9488405, + "step": 437, + "time_per_iteration": 2.8471429347991943 + }, + { + "auxiliary_loss_clip": 0.01259901, + "auxiliary_loss_mlp": 0.01054144, + "balance_loss_clip": 1.07766271, + "balance_loss_mlp": 1.03888524, + "epoch": 0.052666386099921844, + "flos": 22158140578560.0, + "grad_norm": 1.7520408156355585, + "language_loss": 0.91556644, + "learning_rate": 3.994640738419402e-06, + "loss": 0.93870687, + "num_input_tokens_seen": 9507780, + "step": 438, + "time_per_iteration": 2.873599052429199 + }, + { + "auxiliary_loss_clip": 0.01274003, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.07774472, + "balance_loss_mlp": 1.02959228, + "epoch": 0.052786628990560934, + "flos": 23881817502720.0, + "grad_norm": 2.007857097295353, + "language_loss": 0.80960095, + "learning_rate": 3.9945835991832075e-06, + "loss": 0.83277953, + "num_input_tokens_seen": 9529665, + "step": 439, + "time_per_iteration": 2.573204517364502 + }, + { + "auxiliary_loss_clip": 0.01294579, + "auxiliary_loss_mlp": 0.01057563, + "balance_loss_clip": 1.08350658, + "balance_loss_mlp": 1.04297209, + "epoch": 0.052906871881200024, + "flos": 24605021934720.0, + "grad_norm": 2.7126775079446124, + "language_loss": 0.92883289, + "learning_rate": 3.994526157370268e-06, + "loss": 0.95235425, + "num_input_tokens_seen": 9548280, + "step": 440, + "time_per_iteration": 2.5361196994781494 + }, + { + "auxiliary_loss_clip": 0.01149993, + "auxiliary_loss_mlp": 0.01006307, + "balance_loss_clip": 1.03666568, + "balance_loss_mlp": 1.00089502, + "epoch": 0.053027114771839114, + "flos": 56461631143680.0, + "grad_norm": 0.8956927470746533, + "language_loss": 0.59263247, + "learning_rate": 3.994468412989296e-06, + "loss": 0.61419547, + "num_input_tokens_seen": 9609690, + "step": 441, + "time_per_iteration": 3.219245672225952 + }, + { + "auxiliary_loss_clip": 0.01233992, + "auxiliary_loss_mlp": 0.01049743, + "balance_loss_clip": 1.06908166, + "balance_loss_mlp": 1.03502071, + "epoch": 0.053147357662478203, + "flos": 17311098481920.0, + "grad_norm": 2.0762146149015854, + "language_loss": 0.92690444, + "learning_rate": 3.994410366049052e-06, + "loss": 0.94974178, + "num_input_tokens_seen": 9627550, + "step": 442, + "time_per_iteration": 2.536608934402466 + }, + { + "auxiliary_loss_clip": 0.01272897, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_clip": 1.07576668, + "balance_loss_mlp": 1.02873874, + "epoch": 0.0532676005531173, + "flos": 17164977955200.0, + "grad_norm": 2.1637109291866286, + "language_loss": 0.83134085, + "learning_rate": 3.994352016558341e-06, + "loss": 0.8545059, + "num_input_tokens_seen": 9644855, + "step": 443, + "time_per_iteration": 2.4811205863952637 + }, + { + "auxiliary_loss_clip": 0.01274636, + "auxiliary_loss_mlp": 0.01051717, + "balance_loss_clip": 1.07829463, + "balance_loss_mlp": 1.03706574, + "epoch": 0.05338784344375639, + "flos": 27819960831360.0, + "grad_norm": 1.9838096726617758, + "language_loss": 0.73960245, + "learning_rate": 3.994293364526014e-06, + "loss": 0.76286602, + "num_input_tokens_seen": 9665740, + "step": 444, + "time_per_iteration": 2.6801724433898926 + }, + { + "auxiliary_loss_clip": 0.01249816, + "auxiliary_loss_mlp": 0.01047337, + "balance_loss_clip": 1.07561362, + "balance_loss_mlp": 1.03107703, + "epoch": 0.05350808633439548, + "flos": 21507691144320.0, + "grad_norm": 2.3305621249999358, + "language_loss": 0.84785217, + "learning_rate": 3.99423440996097e-06, + "loss": 0.87082368, + "num_input_tokens_seen": 9685280, + "step": 445, + "time_per_iteration": 2.5926566123962402 + }, + { + "auxiliary_loss_clip": 0.01259055, + "auxiliary_loss_mlp": 0.01051048, + "balance_loss_clip": 1.07913065, + "balance_loss_mlp": 1.03587234, + "epoch": 0.05362832922503457, + "flos": 20084299920000.0, + "grad_norm": 6.29055922851008, + "language_loss": 0.81651968, + "learning_rate": 3.994175152872152e-06, + "loss": 0.83962071, + "num_input_tokens_seen": 9704365, + "step": 446, + "time_per_iteration": 2.569978713989258 + }, + { + "auxiliary_loss_clip": 0.01275756, + "auxiliary_loss_mlp": 0.01041242, + "balance_loss_clip": 1.07585716, + "balance_loss_mlp": 1.02719927, + "epoch": 0.05374857211567366, + "flos": 26137222433280.0, + "grad_norm": 2.196225724100749, + "language_loss": 0.79002571, + "learning_rate": 3.994115593268548e-06, + "loss": 0.81319571, + "num_input_tokens_seen": 9724145, + "step": 447, + "time_per_iteration": 3.436283826828003 + }, + { + "auxiliary_loss_clip": 0.01291874, + "auxiliary_loss_mlp": 0.01053269, + "balance_loss_clip": 1.07938385, + "balance_loss_mlp": 1.03897595, + "epoch": 0.05386881500631275, + "flos": 27486817165440.0, + "grad_norm": 1.9615877816766543, + "language_loss": 0.82201803, + "learning_rate": 3.994055731159195e-06, + "loss": 0.84546947, + "num_input_tokens_seen": 9741615, + "step": 448, + "time_per_iteration": 2.530015230178833 + }, + { + "auxiliary_loss_clip": 0.01277408, + "auxiliary_loss_mlp": 0.01060105, + "balance_loss_clip": 1.08094954, + "balance_loss_mlp": 1.0459249, + "epoch": 0.053989057896951846, + "flos": 23585087249280.0, + "grad_norm": 1.8797501118212996, + "language_loss": 0.8712545, + "learning_rate": 3.993995566553172e-06, + "loss": 0.8946296, + "num_input_tokens_seen": 9760580, + "step": 449, + "time_per_iteration": 2.6010842323303223 + }, + { + "auxiliary_loss_clip": 0.01235954, + "auxiliary_loss_mlp": 0.01046144, + "balance_loss_clip": 1.06595063, + "balance_loss_mlp": 1.03150511, + "epoch": 0.054109300787590936, + "flos": 25228862369280.0, + "grad_norm": 1.6976254695350672, + "language_loss": 0.772946, + "learning_rate": 3.993935099459607e-06, + "loss": 0.79576695, + "num_input_tokens_seen": 9782195, + "step": 450, + "time_per_iteration": 2.612295389175415 + }, + { + "auxiliary_loss_clip": 0.01284598, + "auxiliary_loss_mlp": 0.01048673, + "balance_loss_clip": 1.07867634, + "balance_loss_mlp": 1.03505337, + "epoch": 0.054229543678230026, + "flos": 23841525421440.0, + "grad_norm": 2.1527376349446117, + "language_loss": 0.74068034, + "learning_rate": 3.993874329887673e-06, + "loss": 0.76401305, + "num_input_tokens_seen": 9800850, + "step": 451, + "time_per_iteration": 2.5610320568084717 + }, + { + "auxiliary_loss_clip": 0.01275216, + "auxiliary_loss_mlp": 0.01055472, + "balance_loss_clip": 1.07793713, + "balance_loss_mlp": 1.04037964, + "epoch": 0.054349786568869116, + "flos": 16320933192960.0, + "grad_norm": 2.43952839340982, + "language_loss": 0.86216784, + "learning_rate": 3.993813257846589e-06, + "loss": 0.88547456, + "num_input_tokens_seen": 9817605, + "step": 452, + "time_per_iteration": 3.3321056365966797 + }, + { + "auxiliary_loss_clip": 0.01273227, + "auxiliary_loss_mlp": 0.01045703, + "balance_loss_clip": 1.07825232, + "balance_loss_mlp": 1.03110576, + "epoch": 0.054470029459508205, + "flos": 18660729127680.0, + "grad_norm": 2.4540345738589604, + "language_loss": 0.92340803, + "learning_rate": 3.993751883345619e-06, + "loss": 0.94659734, + "num_input_tokens_seen": 9835965, + "step": 453, + "time_per_iteration": 3.3967556953430176 + }, + { + "auxiliary_loss_clip": 0.01253371, + "auxiliary_loss_mlp": 0.0105064, + "balance_loss_clip": 1.0768224, + "balance_loss_mlp": 1.03570938, + "epoch": 0.054590272350147295, + "flos": 17785298856960.0, + "grad_norm": 2.4492687650873255, + "language_loss": 0.87209421, + "learning_rate": 3.993690206394073e-06, + "loss": 0.89513433, + "num_input_tokens_seen": 9852265, + "step": 454, + "time_per_iteration": 3.352182626724243 + }, + { + "auxiliary_loss_clip": 0.01260155, + "auxiliary_loss_mlp": 0.01048789, + "balance_loss_clip": 1.07582223, + "balance_loss_mlp": 1.03435254, + "epoch": 0.054710515240786385, + "flos": 17785945301760.0, + "grad_norm": 2.0167613805190956, + "language_loss": 0.87748712, + "learning_rate": 3.993628227001307e-06, + "loss": 0.90057659, + "num_input_tokens_seen": 9870465, + "step": 455, + "time_per_iteration": 2.5466055870056152 + }, + { + "auxiliary_loss_clip": 0.01255651, + "auxiliary_loss_mlp": 0.01054282, + "balance_loss_clip": 1.07518387, + "balance_loss_mlp": 1.04006076, + "epoch": 0.05483075813142548, + "flos": 48210900180480.0, + "grad_norm": 1.8777234743699365, + "language_loss": 0.71359921, + "learning_rate": 3.993565945176726e-06, + "loss": 0.73669851, + "num_input_tokens_seen": 9891490, + "step": 456, + "time_per_iteration": 2.798192262649536 + }, + { + "auxiliary_loss_clip": 0.01247937, + "auxiliary_loss_mlp": 0.01047025, + "balance_loss_clip": 1.0742749, + "balance_loss_mlp": 1.03255939, + "epoch": 0.05495100102206457, + "flos": 19682244011520.0, + "grad_norm": 1.958825381402083, + "language_loss": 0.84441829, + "learning_rate": 3.993503360929776e-06, + "loss": 0.86736798, + "num_input_tokens_seen": 9910375, + "step": 457, + "time_per_iteration": 2.5571701526641846 + }, + { + "auxiliary_loss_clip": 0.01186518, + "auxiliary_loss_mlp": 0.01048125, + "balance_loss_clip": 1.06417263, + "balance_loss_mlp": 1.03300881, + "epoch": 0.05507124391270366, + "flos": 26360048453760.0, + "grad_norm": 1.7567421591039132, + "language_loss": 0.81142128, + "learning_rate": 3.99344047426995e-06, + "loss": 0.83376771, + "num_input_tokens_seen": 9931635, + "step": 458, + "time_per_iteration": 2.8402340412139893 + }, + { + "auxiliary_loss_clip": 0.01225332, + "auxiliary_loss_mlp": 0.01049963, + "balance_loss_clip": 1.06929886, + "balance_loss_mlp": 1.03465652, + "epoch": 0.05519148680334275, + "flos": 22601314581120.0, + "grad_norm": 2.104397228739993, + "language_loss": 0.93625522, + "learning_rate": 3.993377285206789e-06, + "loss": 0.95900822, + "num_input_tokens_seen": 9951420, + "step": 459, + "time_per_iteration": 2.9285833835601807 + }, + { + "auxiliary_loss_clip": 0.01216721, + "auxiliary_loss_mlp": 0.01056765, + "balance_loss_clip": 1.06795645, + "balance_loss_mlp": 1.04189968, + "epoch": 0.05531172969398184, + "flos": 40552519380480.0, + "grad_norm": 1.7810400891534426, + "language_loss": 0.86242628, + "learning_rate": 3.99331379374988e-06, + "loss": 0.8851611, + "num_input_tokens_seen": 9975025, + "step": 460, + "time_per_iteration": 2.7733118534088135 + }, + { + "auxiliary_loss_clip": 0.01260238, + "auxiliary_loss_mlp": 0.01044803, + "balance_loss_clip": 1.07015657, + "balance_loss_mlp": 1.03131461, + "epoch": 0.05543197258462093, + "flos": 23477894087040.0, + "grad_norm": 2.0558160232231293, + "language_loss": 0.80142117, + "learning_rate": 3.993249999908852e-06, + "loss": 0.82447153, + "num_input_tokens_seen": 9995175, + "step": 461, + "time_per_iteration": 2.61435604095459 + }, + { + "auxiliary_loss_clip": 0.01287158, + "auxiliary_loss_mlp": 0.01047705, + "balance_loss_clip": 1.07595766, + "balance_loss_mlp": 1.03390098, + "epoch": 0.05555221547526003, + "flos": 18624603024000.0, + "grad_norm": 2.0611482450465903, + "language_loss": 0.87428278, + "learning_rate": 3.993185903693384e-06, + "loss": 0.89763141, + "num_input_tokens_seen": 10011975, + "step": 462, + "time_per_iteration": 2.4678704738616943 + }, + { + "auxiliary_loss_clip": 0.0125292, + "auxiliary_loss_mlp": 0.01039594, + "balance_loss_clip": 1.07375169, + "balance_loss_mlp": 1.02593231, + "epoch": 0.05567245836589912, + "flos": 23587098410880.0, + "grad_norm": 2.2576931871790533, + "language_loss": 0.82325077, + "learning_rate": 3.9931215051131995e-06, + "loss": 0.84617591, + "num_input_tokens_seen": 10032620, + "step": 463, + "time_per_iteration": 2.614413022994995 + }, + { + "auxiliary_loss_clip": 0.01256273, + "auxiliary_loss_mlp": 0.01046013, + "balance_loss_clip": 1.06942487, + "balance_loss_mlp": 1.03173161, + "epoch": 0.05579270125653821, + "flos": 27746667129600.0, + "grad_norm": 1.5333354564677812, + "language_loss": 0.80077308, + "learning_rate": 3.993056804178068e-06, + "loss": 0.82379591, + "num_input_tokens_seen": 10054165, + "step": 464, + "time_per_iteration": 2.625532865524292 + }, + { + "auxiliary_loss_clip": 0.01215156, + "auxiliary_loss_mlp": 0.01045022, + "balance_loss_clip": 1.06792319, + "balance_loss_mlp": 1.03006136, + "epoch": 0.0559129441471773, + "flos": 27014161075200.0, + "grad_norm": 2.051909912610453, + "language_loss": 0.84421498, + "learning_rate": 3.992991800897803e-06, + "loss": 0.86681676, + "num_input_tokens_seen": 10073970, + "step": 465, + "time_per_iteration": 2.793992757797241 + }, + { + "auxiliary_loss_clip": 0.01286037, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.07677484, + "balance_loss_mlp": 1.03202069, + "epoch": 0.05603318703781639, + "flos": 15229787794560.0, + "grad_norm": 2.3041596877673123, + "language_loss": 0.89988774, + "learning_rate": 3.9929264952822665e-06, + "loss": 0.92322022, + "num_input_tokens_seen": 10091505, + "step": 466, + "time_per_iteration": 2.5002784729003906 + }, + { + "auxiliary_loss_clip": 0.0127289, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.07353401, + "balance_loss_mlp": 1.03664696, + "epoch": 0.05615342992845548, + "flos": 22266482976000.0, + "grad_norm": 2.3220236947354196, + "language_loss": 0.88353026, + "learning_rate": 3.992860887341366e-06, + "loss": 0.90676868, + "num_input_tokens_seen": 10109675, + "step": 467, + "time_per_iteration": 2.573352575302124 + }, + { + "auxiliary_loss_clip": 0.01225384, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.06847656, + "balance_loss_mlp": 1.02748108, + "epoch": 0.056273672819094574, + "flos": 23584979508480.0, + "grad_norm": 2.2303254393803043, + "language_loss": 0.81188333, + "learning_rate": 3.992794977085052e-06, + "loss": 0.83456481, + "num_input_tokens_seen": 10127675, + "step": 468, + "time_per_iteration": 2.624342679977417 + }, + { + "auxiliary_loss_clip": 0.01241007, + "auxiliary_loss_mlp": 0.01053444, + "balance_loss_clip": 1.0728451, + "balance_loss_mlp": 1.03932929, + "epoch": 0.056393915709733664, + "flos": 19858708552320.0, + "grad_norm": 2.1551598959605265, + "language_loss": 0.84982693, + "learning_rate": 3.992728764523326e-06, + "loss": 0.87277138, + "num_input_tokens_seen": 10146620, + "step": 469, + "time_per_iteration": 2.6475706100463867 + }, + { + "auxiliary_loss_clip": 0.01254025, + "auxiliary_loss_mlp": 0.01047833, + "balance_loss_clip": 1.07240641, + "balance_loss_mlp": 1.03345656, + "epoch": 0.05651415860037275, + "flos": 22163779013760.0, + "grad_norm": 2.1054534597155476, + "language_loss": 0.80992544, + "learning_rate": 3.99266224966623e-06, + "loss": 0.8329441, + "num_input_tokens_seen": 10167535, + "step": 470, + "time_per_iteration": 2.619994640350342 + }, + { + "auxiliary_loss_clip": 0.01244018, + "auxiliary_loss_mlp": 0.01046113, + "balance_loss_clip": 1.07288432, + "balance_loss_mlp": 1.03129482, + "epoch": 0.05663440149101184, + "flos": 19463548055040.0, + "grad_norm": 1.9288716141113582, + "language_loss": 0.87809944, + "learning_rate": 3.992595432523855e-06, + "loss": 0.90100068, + "num_input_tokens_seen": 10184825, + "step": 471, + "time_per_iteration": 2.569845676422119 + }, + { + "auxiliary_loss_clip": 0.01227736, + "auxiliary_loss_mlp": 0.01050565, + "balance_loss_clip": 1.06888604, + "balance_loss_mlp": 1.03649819, + "epoch": 0.05675464438165093, + "flos": 22670226823680.0, + "grad_norm": 1.9535716613615486, + "language_loss": 0.86224347, + "learning_rate": 3.992528313106338e-06, + "loss": 0.88502645, + "num_input_tokens_seen": 10203025, + "step": 472, + "time_per_iteration": 2.633270502090454 + }, + { + "auxiliary_loss_clip": 0.0128748, + "auxiliary_loss_mlp": 0.00766872, + "balance_loss_clip": 1.08005643, + "balance_loss_mlp": 1.00075042, + "epoch": 0.05687488727229002, + "flos": 16901177495040.0, + "grad_norm": 2.305118764708682, + "language_loss": 0.81970888, + "learning_rate": 3.9924608914238595e-06, + "loss": 0.8402524, + "num_input_tokens_seen": 10218020, + "step": 473, + "time_per_iteration": 2.520453691482544 + }, + { + "auxiliary_loss_clip": 0.01270703, + "auxiliary_loss_mlp": 0.01049679, + "balance_loss_clip": 1.07682896, + "balance_loss_mlp": 1.03543305, + "epoch": 0.05699513016292912, + "flos": 29168980945920.0, + "grad_norm": 2.2892678541469316, + "language_loss": 0.84009999, + "learning_rate": 3.992393167486648e-06, + "loss": 0.86330378, + "num_input_tokens_seen": 10237170, + "step": 474, + "time_per_iteration": 3.406655788421631 + }, + { + "auxiliary_loss_clip": 0.01289301, + "auxiliary_loss_mlp": 0.01055027, + "balance_loss_clip": 1.07881534, + "balance_loss_mlp": 1.03964925, + "epoch": 0.05711537305356821, + "flos": 18916197632640.0, + "grad_norm": 2.5701574061421413, + "language_loss": 0.80658674, + "learning_rate": 3.992325141304977e-06, + "loss": 0.83002996, + "num_input_tokens_seen": 10255125, + "step": 475, + "time_per_iteration": 2.5102367401123047 + }, + { + "auxiliary_loss_clip": 0.01224152, + "auxiliary_loss_mlp": 0.01048825, + "balance_loss_clip": 1.06943631, + "balance_loss_mlp": 1.03480554, + "epoch": 0.0572356159442073, + "flos": 26758979879040.0, + "grad_norm": 2.231398342184023, + "language_loss": 0.86522353, + "learning_rate": 3.992256812889166e-06, + "loss": 0.88795334, + "num_input_tokens_seen": 10271230, + "step": 476, + "time_per_iteration": 2.6430747509002686 + }, + { + "auxiliary_loss_clip": 0.01287087, + "auxiliary_loss_mlp": 0.01047905, + "balance_loss_clip": 1.07895398, + "balance_loss_mlp": 1.03381395, + "epoch": 0.05735585883484639, + "flos": 35116146840960.0, + "grad_norm": 4.912103906002928, + "language_loss": 0.76712555, + "learning_rate": 3.992188182249582e-06, + "loss": 0.79047549, + "num_input_tokens_seen": 10293125, + "step": 477, + "time_per_iteration": 2.6121628284454346 + }, + { + "auxiliary_loss_clip": 0.01252452, + "auxiliary_loss_mlp": 0.01055691, + "balance_loss_clip": 1.0760603, + "balance_loss_mlp": 1.04084325, + "epoch": 0.05747610172548548, + "flos": 18734381965440.0, + "grad_norm": 7.740941948675379, + "language_loss": 0.90576172, + "learning_rate": 3.992119249396633e-06, + "loss": 0.92884308, + "num_input_tokens_seen": 10311810, + "step": 478, + "time_per_iteration": 2.526930332183838 + }, + { + "auxiliary_loss_clip": 0.01243971, + "auxiliary_loss_mlp": 0.00766679, + "balance_loss_clip": 1.06933284, + "balance_loss_mlp": 1.000741, + "epoch": 0.05759634461612457, + "flos": 27964752554880.0, + "grad_norm": 1.9002993415101694, + "language_loss": 0.82179916, + "learning_rate": 3.992050014340778e-06, + "loss": 0.84190571, + "num_input_tokens_seen": 10332165, + "step": 479, + "time_per_iteration": 3.464747190475464 + }, + { + "auxiliary_loss_clip": 0.01147847, + "auxiliary_loss_mlp": 0.01009889, + "balance_loss_clip": 1.03669572, + "balance_loss_mlp": 1.00461996, + "epoch": 0.057716587506763666, + "flos": 69292009405440.0, + "grad_norm": 0.8640049590043475, + "language_loss": 0.55069149, + "learning_rate": 3.99198047709252e-06, + "loss": 0.57226884, + "num_input_tokens_seen": 10393685, + "step": 480, + "time_per_iteration": 3.161722183227539 + }, + { + "auxiliary_loss_clip": 0.01232759, + "auxiliary_loss_mlp": 0.0105072, + "balance_loss_clip": 1.06522787, + "balance_loss_mlp": 1.03571713, + "epoch": 0.057836830397402755, + "flos": 25009196745600.0, + "grad_norm": 1.8616834802833198, + "language_loss": 0.78794473, + "learning_rate": 3.991910637662408e-06, + "loss": 0.81077951, + "num_input_tokens_seen": 10413975, + "step": 481, + "time_per_iteration": 3.4052248001098633 + }, + { + "auxiliary_loss_clip": 0.01285052, + "auxiliary_loss_mlp": 0.01040041, + "balance_loss_clip": 1.07858062, + "balance_loss_mlp": 1.02562273, + "epoch": 0.057957073288041845, + "flos": 25593894334080.0, + "grad_norm": 1.8138002061911072, + "language_loss": 0.80608875, + "learning_rate": 3.9918404960610355e-06, + "loss": 0.82933968, + "num_input_tokens_seen": 10433005, + "step": 482, + "time_per_iteration": 2.5188021659851074 + }, + { + "auxiliary_loss_clip": 0.01277709, + "auxiliary_loss_mlp": 0.01051611, + "balance_loss_clip": 1.07918572, + "balance_loss_mlp": 1.03740668, + "epoch": 0.058077316178680935, + "flos": 20777411733120.0, + "grad_norm": 2.261458503024001, + "language_loss": 0.77691972, + "learning_rate": 3.991770052299043e-06, + "loss": 0.80021298, + "num_input_tokens_seen": 10451235, + "step": 483, + "time_per_iteration": 2.569852828979492 + }, + { + "auxiliary_loss_clip": 0.01250886, + "auxiliary_loss_mlp": 0.010403, + "balance_loss_clip": 1.07015252, + "balance_loss_mlp": 1.0271039, + "epoch": 0.058197559069320025, + "flos": 18916484941440.0, + "grad_norm": 2.1821964473996855, + "language_loss": 0.87601757, + "learning_rate": 3.991699306387118e-06, + "loss": 0.89892948, + "num_input_tokens_seen": 10469705, + "step": 484, + "time_per_iteration": 2.537954330444336 + }, + { + "auxiliary_loss_clip": 0.01269438, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.07506013, + "balance_loss_mlp": 1.03837669, + "epoch": 0.058317801959959115, + "flos": 24863327614080.0, + "grad_norm": 1.8435456857959243, + "language_loss": 0.78118783, + "learning_rate": 3.991628258335991e-06, + "loss": 0.8044073, + "num_input_tokens_seen": 10491910, + "step": 485, + "time_per_iteration": 2.5575599670410156 + }, + { + "auxiliary_loss_clip": 0.01228738, + "auxiliary_loss_mlp": 0.01045503, + "balance_loss_clip": 1.06771517, + "balance_loss_mlp": 1.03119779, + "epoch": 0.05843804485059821, + "flos": 23257977068160.0, + "grad_norm": 3.9403707254477287, + "language_loss": 0.8729986, + "learning_rate": 3.991556908156442e-06, + "loss": 0.89574105, + "num_input_tokens_seen": 10508435, + "step": 486, + "time_per_iteration": 2.5650887489318848 + }, + { + "auxiliary_loss_clip": 0.01256629, + "auxiliary_loss_mlp": 0.01053806, + "balance_loss_clip": 1.07344568, + "balance_loss_mlp": 1.03997207, + "epoch": 0.0585582877412373, + "flos": 23150532510720.0, + "grad_norm": 2.5836211887872502, + "language_loss": 0.87835133, + "learning_rate": 3.9914852558592914e-06, + "loss": 0.90145564, + "num_input_tokens_seen": 10529485, + "step": 487, + "time_per_iteration": 2.5938074588775635 + }, + { + "auxiliary_loss_clip": 0.01269577, + "auxiliary_loss_mlp": 0.01045475, + "balance_loss_clip": 1.0771538, + "balance_loss_mlp": 1.03089583, + "epoch": 0.05867853063187639, + "flos": 23506406507520.0, + "grad_norm": 3.245791908158948, + "language_loss": 0.80608124, + "learning_rate": 3.991413301455413e-06, + "loss": 0.82923174, + "num_input_tokens_seen": 10545935, + "step": 488, + "time_per_iteration": 2.5468804836273193 + }, + { + "auxiliary_loss_clip": 0.01237281, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_clip": 1.06981635, + "balance_loss_mlp": 1.0305438, + "epoch": 0.05879877352251548, + "flos": 29495803818240.0, + "grad_norm": 2.8241493027910707, + "language_loss": 0.78130937, + "learning_rate": 3.991341044955719e-06, + "loss": 0.80411935, + "num_input_tokens_seen": 10565690, + "step": 489, + "time_per_iteration": 2.7312850952148438 + }, + { + "auxiliary_loss_clip": 0.01265329, + "auxiliary_loss_mlp": 0.00767242, + "balance_loss_clip": 1.0730257, + "balance_loss_mlp": 1.00077808, + "epoch": 0.05891901641315457, + "flos": 20157485880960.0, + "grad_norm": 1.960062980502636, + "language_loss": 0.81499922, + "learning_rate": 3.991268486371172e-06, + "loss": 0.83532488, + "num_input_tokens_seen": 10584245, + "step": 490, + "time_per_iteration": 2.597562074661255 + }, + { + "auxiliary_loss_clip": 0.01251835, + "auxiliary_loss_mlp": 0.01052381, + "balance_loss_clip": 1.07094288, + "balance_loss_mlp": 1.03585863, + "epoch": 0.05903925930379366, + "flos": 24644200694400.0, + "grad_norm": 2.4332300998667087, + "language_loss": 0.87856263, + "learning_rate": 3.991195625712779e-06, + "loss": 0.90160477, + "num_input_tokens_seen": 10601210, + "step": 491, + "time_per_iteration": 2.646657943725586 + }, + { + "auxiliary_loss_clip": 0.01283379, + "auxiliary_loss_mlp": 0.01044919, + "balance_loss_clip": 1.07858241, + "balance_loss_mlp": 1.03049469, + "epoch": 0.05915950219443276, + "flos": 21250391045760.0, + "grad_norm": 1.9560043557368332, + "language_loss": 0.81519854, + "learning_rate": 3.991122462991592e-06, + "loss": 0.83848155, + "num_input_tokens_seen": 10620730, + "step": 492, + "time_per_iteration": 2.55846905708313 + }, + { + "auxiliary_loss_clip": 0.01288231, + "auxiliary_loss_mlp": 0.01050067, + "balance_loss_clip": 1.07664061, + "balance_loss_mlp": 1.03595257, + "epoch": 0.05927974508507185, + "flos": 9902727319680.0, + "grad_norm": 4.138479272791149, + "language_loss": 0.81250608, + "learning_rate": 3.991048998218712e-06, + "loss": 0.8358891, + "num_input_tokens_seen": 10634035, + "step": 493, + "time_per_iteration": 2.4798851013183594 + }, + { + "auxiliary_loss_clip": 0.01265389, + "auxiliary_loss_mlp": 0.01050066, + "balance_loss_clip": 1.07141185, + "balance_loss_mlp": 1.0360117, + "epoch": 0.05939998797571094, + "flos": 18259499232000.0, + "grad_norm": 2.6924464145983067, + "language_loss": 0.76272655, + "learning_rate": 3.990975231405281e-06, + "loss": 0.78588104, + "num_input_tokens_seen": 10652485, + "step": 494, + "time_per_iteration": 2.540430784225464 + }, + { + "auxiliary_loss_clip": 0.01264716, + "auxiliary_loss_mlp": 0.01046595, + "balance_loss_clip": 1.0755918, + "balance_loss_mlp": 1.03237343, + "epoch": 0.05952023086635003, + "flos": 28256598558720.0, + "grad_norm": 1.7629613226632177, + "language_loss": 0.78670263, + "learning_rate": 3.990901162562491e-06, + "loss": 0.80981576, + "num_input_tokens_seen": 10673175, + "step": 495, + "time_per_iteration": 2.63199782371521 + }, + { + "auxiliary_loss_clip": 0.01227425, + "auxiliary_loss_mlp": 0.00767761, + "balance_loss_clip": 1.06459618, + "balance_loss_mlp": 1.00068176, + "epoch": 0.05964047375698912, + "flos": 14902498045440.0, + "grad_norm": 2.473320868503803, + "language_loss": 0.90764403, + "learning_rate": 3.9908267917015765e-06, + "loss": 0.92759585, + "num_input_tokens_seen": 10691235, + "step": 496, + "time_per_iteration": 2.6073436737060547 + }, + { + "auxiliary_loss_clip": 0.01254178, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_clip": 1.06982493, + "balance_loss_mlp": 1.04096711, + "epoch": 0.059760716647628206, + "flos": 23185581206400.0, + "grad_norm": 2.4768764183403613, + "language_loss": 0.93161809, + "learning_rate": 3.990752118833821e-06, + "loss": 0.95471656, + "num_input_tokens_seen": 10708675, + "step": 497, + "time_per_iteration": 2.5714454650878906 + }, + { + "auxiliary_loss_clip": 0.01283475, + "auxiliary_loss_mlp": 0.01045611, + "balance_loss_clip": 1.07747734, + "balance_loss_mlp": 1.03153205, + "epoch": 0.0598809595382673, + "flos": 22746968231040.0, + "grad_norm": 1.816147997487087, + "language_loss": 0.78227592, + "learning_rate": 3.990677143970553e-06, + "loss": 0.80556679, + "num_input_tokens_seen": 10729485, + "step": 498, + "time_per_iteration": 2.589621067047119 + }, + { + "auxiliary_loss_clip": 0.01230367, + "auxiliary_loss_mlp": 0.01055318, + "balance_loss_clip": 1.07252336, + "balance_loss_mlp": 1.0402143, + "epoch": 0.06000120242890639, + "flos": 22127221946880.0, + "grad_norm": 2.3668604459595377, + "language_loss": 0.81221884, + "learning_rate": 3.990601867123144e-06, + "loss": 0.83507574, + "num_input_tokens_seen": 10749210, + "step": 499, + "time_per_iteration": 2.6350436210632324 + }, + { + "auxiliary_loss_clip": 0.0121815, + "auxiliary_loss_mlp": 0.01049183, + "balance_loss_clip": 1.06963706, + "balance_loss_mlp": 1.03477049, + "epoch": 0.06012144531954548, + "flos": 19171773878400.0, + "grad_norm": 2.065752786043113, + "language_loss": 0.84776658, + "learning_rate": 3.990526288303014e-06, + "loss": 0.87043989, + "num_input_tokens_seen": 10768000, + "step": 500, + "time_per_iteration": 2.630920171737671 + }, + { + "auxiliary_loss_clip": 0.01245623, + "auxiliary_loss_mlp": 0.00766268, + "balance_loss_clip": 1.07011294, + "balance_loss_mlp": 1.00060201, + "epoch": 0.06024168821018457, + "flos": 22783345729920.0, + "grad_norm": 1.711959340986966, + "language_loss": 0.91168308, + "learning_rate": 3.9904504075216295e-06, + "loss": 0.93180203, + "num_input_tokens_seen": 10788760, + "step": 501, + "time_per_iteration": 3.4149577617645264 + }, + { + "auxiliary_loss_clip": 0.01232014, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.06641507, + "balance_loss_mlp": 1.03676319, + "epoch": 0.06036193110082366, + "flos": 18770687637120.0, + "grad_norm": 2.0796848827466285, + "language_loss": 0.94184071, + "learning_rate": 3.990374224790501e-06, + "loss": 0.96467751, + "num_input_tokens_seen": 10806965, + "step": 502, + "time_per_iteration": 2.604987621307373 + }, + { + "auxiliary_loss_clip": 0.01248037, + "auxiliary_loss_mlp": 0.01052614, + "balance_loss_clip": 1.07285142, + "balance_loss_mlp": 1.03800464, + "epoch": 0.06048217399146275, + "flos": 17201570935680.0, + "grad_norm": 1.8303229565552828, + "language_loss": 0.70841467, + "learning_rate": 3.990297740121185e-06, + "loss": 0.73142111, + "num_input_tokens_seen": 10824900, + "step": 503, + "time_per_iteration": 2.5631203651428223 + }, + { + "auxiliary_loss_clip": 0.0126291, + "auxiliary_loss_mlp": 0.00766852, + "balance_loss_clip": 1.07250094, + "balance_loss_mlp": 1.00057173, + "epoch": 0.06060241688210185, + "flos": 24024131187840.0, + "grad_norm": 1.7632762460295015, + "language_loss": 0.78359771, + "learning_rate": 3.990220953525284e-06, + "loss": 0.80389529, + "num_input_tokens_seen": 10842010, + "step": 504, + "time_per_iteration": 2.5622775554656982 + }, + { + "auxiliary_loss_clip": 0.01236032, + "auxiliary_loss_mlp": 0.01049064, + "balance_loss_clip": 1.06633329, + "balance_loss_mlp": 1.03552783, + "epoch": 0.06072265977274094, + "flos": 14611190745600.0, + "grad_norm": 3.417936454705504, + "language_loss": 0.74035621, + "learning_rate": 3.9901438650144465e-06, + "loss": 0.76320714, + "num_input_tokens_seen": 10858260, + "step": 505, + "time_per_iteration": 3.3221349716186523 + }, + { + "auxiliary_loss_clip": 0.01255656, + "auxiliary_loss_mlp": 0.01044378, + "balance_loss_clip": 1.07094288, + "balance_loss_mlp": 1.03138399, + "epoch": 0.06084290266338003, + "flos": 20558284813440.0, + "grad_norm": 4.427473181387212, + "language_loss": 0.91775775, + "learning_rate": 3.990066474600367e-06, + "loss": 0.94075811, + "num_input_tokens_seen": 10876230, + "step": 506, + "time_per_iteration": 3.4471898078918457 + }, + { + "auxiliary_loss_clip": 0.01248885, + "auxiliary_loss_mlp": 0.01047921, + "balance_loss_clip": 1.06613219, + "balance_loss_mlp": 1.03276944, + "epoch": 0.06096314555401912, + "flos": 22309217182080.0, + "grad_norm": 2.097791928212606, + "language_loss": 0.67857343, + "learning_rate": 3.989988782294786e-06, + "loss": 0.70154148, + "num_input_tokens_seen": 10896320, + "step": 507, + "time_per_iteration": 2.5640347003936768 + }, + { + "auxiliary_loss_clip": 0.01216106, + "auxiliary_loss_mlp": 0.01053604, + "balance_loss_clip": 1.06556737, + "balance_loss_mlp": 1.03891158, + "epoch": 0.06108338844465821, + "flos": 19131374056320.0, + "grad_norm": 1.7265108818948975, + "language_loss": 0.95145917, + "learning_rate": 3.989910788109489e-06, + "loss": 0.97415626, + "num_input_tokens_seen": 10912970, + "step": 508, + "time_per_iteration": 3.319075584411621 + }, + { + "auxiliary_loss_clip": 0.01225846, + "auxiliary_loss_mlp": 0.01045771, + "balance_loss_clip": 1.06460392, + "balance_loss_mlp": 1.03205013, + "epoch": 0.0612036313352973, + "flos": 33584018169600.0, + "grad_norm": 4.653434425353272, + "language_loss": 0.75043541, + "learning_rate": 3.989832492056307e-06, + "loss": 0.77315158, + "num_input_tokens_seen": 10933995, + "step": 509, + "time_per_iteration": 2.7116730213165283 + }, + { + "auxiliary_loss_clip": 0.0126432, + "auxiliary_loss_mlp": 0.01049766, + "balance_loss_clip": 1.07376266, + "balance_loss_mlp": 1.03516316, + "epoch": 0.06132387422593639, + "flos": 27490552179840.0, + "grad_norm": 2.30611056388382, + "language_loss": 0.8086307, + "learning_rate": 3.989753894147119e-06, + "loss": 0.83177161, + "num_input_tokens_seen": 10954120, + "step": 510, + "time_per_iteration": 2.5530357360839844 + }, + { + "auxiliary_loss_clip": 0.01258622, + "auxiliary_loss_mlp": 0.01046003, + "balance_loss_clip": 1.07547843, + "balance_loss_mlp": 1.03217459, + "epoch": 0.061444117116575485, + "flos": 25885057979520.0, + "grad_norm": 1.8684002224500142, + "language_loss": 0.79945993, + "learning_rate": 3.989674994393846e-06, + "loss": 0.82250619, + "num_input_tokens_seen": 10973595, + "step": 511, + "time_per_iteration": 2.58402943611145 + }, + { + "auxiliary_loss_clip": 0.01262331, + "auxiliary_loss_mlp": 0.01041539, + "balance_loss_clip": 1.07509351, + "balance_loss_mlp": 1.02762151, + "epoch": 0.061564360007214575, + "flos": 28512031150080.0, + "grad_norm": 2.1365770111365485, + "language_loss": 0.94072628, + "learning_rate": 3.98959579280846e-06, + "loss": 0.96376503, + "num_input_tokens_seen": 10991995, + "step": 512, + "time_per_iteration": 2.5635733604431152 + }, + { + "auxiliary_loss_clip": 0.01196261, + "auxiliary_loss_mlp": 0.01045152, + "balance_loss_clip": 1.06744456, + "balance_loss_mlp": 1.03076339, + "epoch": 0.061684602897853665, + "flos": 12094355652480.0, + "grad_norm": 2.2795350665917007, + "language_loss": 0.83073515, + "learning_rate": 3.989516289402973e-06, + "loss": 0.85314929, + "num_input_tokens_seen": 11007625, + "step": 513, + "time_per_iteration": 2.649857759475708 + }, + { + "auxiliary_loss_clip": 0.01177261, + "auxiliary_loss_mlp": 0.01048162, + "balance_loss_clip": 1.0571543, + "balance_loss_mlp": 1.03379762, + "epoch": 0.061804845788492754, + "flos": 19532639865600.0, + "grad_norm": 2.8027560640377995, + "language_loss": 0.80464005, + "learning_rate": 3.989436484189447e-06, + "loss": 0.82689428, + "num_input_tokens_seen": 11025570, + "step": 514, + "time_per_iteration": 2.6532979011535645 + }, + { + "auxiliary_loss_clip": 0.01263404, + "auxiliary_loss_mlp": 0.01041708, + "balance_loss_clip": 1.06956124, + "balance_loss_mlp": 1.02771306, + "epoch": 0.061925088679131844, + "flos": 15341111020800.0, + "grad_norm": 2.9314761155770612, + "language_loss": 0.80997372, + "learning_rate": 3.9893563771799885e-06, + "loss": 0.83302486, + "num_input_tokens_seen": 11042045, + "step": 515, + "time_per_iteration": 2.526691198348999 + }, + { + "auxiliary_loss_clip": 0.01279233, + "auxiliary_loss_mlp": 0.01048377, + "balance_loss_clip": 1.07494998, + "balance_loss_mlp": 1.03396511, + "epoch": 0.062045331569770934, + "flos": 25919927107200.0, + "grad_norm": 2.3013100180536035, + "language_loss": 0.85945821, + "learning_rate": 3.989275968386749e-06, + "loss": 0.8827343, + "num_input_tokens_seen": 11059955, + "step": 516, + "time_per_iteration": 2.540703773498535 + }, + { + "auxiliary_loss_clip": 0.01238818, + "auxiliary_loss_mlp": 0.01051251, + "balance_loss_clip": 1.06632495, + "balance_loss_mlp": 1.03590882, + "epoch": 0.06216557446041003, + "flos": 28110621686400.0, + "grad_norm": 2.2433027139282955, + "language_loss": 0.76641953, + "learning_rate": 3.989195257821926e-06, + "loss": 0.78932023, + "num_input_tokens_seen": 11078440, + "step": 517, + "time_per_iteration": 2.687325954437256 + }, + { + "auxiliary_loss_clip": 0.01242201, + "auxiliary_loss_mlp": 0.01050637, + "balance_loss_clip": 1.07084441, + "balance_loss_mlp": 1.03628373, + "epoch": 0.06228581735104912, + "flos": 23478181395840.0, + "grad_norm": 2.1961413645515186, + "language_loss": 0.84141314, + "learning_rate": 3.989114245497765e-06, + "loss": 0.86434156, + "num_input_tokens_seen": 11098240, + "step": 518, + "time_per_iteration": 2.602976083755493 + }, + { + "auxiliary_loss_clip": 0.01261688, + "auxiliary_loss_mlp": 0.0104308, + "balance_loss_clip": 1.06707501, + "balance_loss_mlp": 1.02932966, + "epoch": 0.06240606024168821, + "flos": 15195205975680.0, + "grad_norm": 2.5054984938972042, + "language_loss": 0.94887328, + "learning_rate": 3.989032931426554e-06, + "loss": 0.97192097, + "num_input_tokens_seen": 11115395, + "step": 519, + "time_per_iteration": 2.5378425121307373 + }, + { + "auxiliary_loss_clip": 0.01237419, + "auxiliary_loss_mlp": 0.01044545, + "balance_loss_clip": 1.06759143, + "balance_loss_mlp": 1.03063893, + "epoch": 0.06252630313232731, + "flos": 20631829910400.0, + "grad_norm": 2.209568759441187, + "language_loss": 0.86931837, + "learning_rate": 3.9889513156206295e-06, + "loss": 0.892138, + "num_input_tokens_seen": 11134835, + "step": 520, + "time_per_iteration": 2.536693572998047 + }, + { + "auxiliary_loss_clip": 0.01234608, + "auxiliary_loss_mlp": 0.01047263, + "balance_loss_clip": 1.06920779, + "balance_loss_mlp": 1.03227854, + "epoch": 0.06264654602296639, + "flos": 20778058177920.0, + "grad_norm": 2.9212118464256167, + "language_loss": 0.73774111, + "learning_rate": 3.988869398092371e-06, + "loss": 0.7605598, + "num_input_tokens_seen": 11154745, + "step": 521, + "time_per_iteration": 2.6357240676879883 + }, + { + "auxiliary_loss_clip": 0.01246808, + "auxiliary_loss_mlp": 0.01047027, + "balance_loss_clip": 1.07113934, + "balance_loss_mlp": 1.03257942, + "epoch": 0.06276678891360549, + "flos": 29605798241280.0, + "grad_norm": 2.363192753656143, + "language_loss": 0.79168439, + "learning_rate": 3.988787178854206e-06, + "loss": 0.81462276, + "num_input_tokens_seen": 11174280, + "step": 522, + "time_per_iteration": 2.5789244174957275 + }, + { + "auxiliary_loss_clip": 0.01278253, + "auxiliary_loss_mlp": 0.01048427, + "balance_loss_clip": 1.07506073, + "balance_loss_mlp": 1.03434825, + "epoch": 0.06288703180424457, + "flos": 22126288193280.0, + "grad_norm": 2.292919083044179, + "language_loss": 0.8775146, + "learning_rate": 3.988704657918608e-06, + "loss": 0.90078139, + "num_input_tokens_seen": 11193340, + "step": 523, + "time_per_iteration": 2.5421876907348633 + }, + { + "auxiliary_loss_clip": 0.01260593, + "auxiliary_loss_mlp": 0.01053341, + "balance_loss_clip": 1.07486653, + "balance_loss_mlp": 1.03994203, + "epoch": 0.06300727469488367, + "flos": 14976689587200.0, + "grad_norm": 2.8851721886215547, + "language_loss": 0.80009913, + "learning_rate": 3.988621835298094e-06, + "loss": 0.82323843, + "num_input_tokens_seen": 11210555, + "step": 524, + "time_per_iteration": 2.4824345111846924 + }, + { + "auxiliary_loss_clip": 0.01274082, + "auxiliary_loss_mlp": 0.01045944, + "balance_loss_clip": 1.07479048, + "balance_loss_mlp": 1.0320915, + "epoch": 0.06312751758552275, + "flos": 24535391420160.0, + "grad_norm": 1.9399650971274744, + "language_loss": 0.91913724, + "learning_rate": 3.988538711005229e-06, + "loss": 0.94233757, + "num_input_tokens_seen": 11230010, + "step": 525, + "time_per_iteration": 2.5533370971679688 + }, + { + "auxiliary_loss_clip": 0.01254739, + "auxiliary_loss_mlp": 0.01045345, + "balance_loss_clip": 1.07180393, + "balance_loss_mlp": 1.03244102, + "epoch": 0.06324776047616185, + "flos": 21507008785920.0, + "grad_norm": 2.257703989251811, + "language_loss": 0.88241124, + "learning_rate": 3.988455285052622e-06, + "loss": 0.90541208, + "num_input_tokens_seen": 11246190, + "step": 526, + "time_per_iteration": 2.486172914505005 + }, + { + "auxiliary_loss_clip": 0.01257162, + "auxiliary_loss_mlp": 0.010524, + "balance_loss_clip": 1.07308877, + "balance_loss_mlp": 1.03854191, + "epoch": 0.06336800336680094, + "flos": 21688034353920.0, + "grad_norm": 2.0333813020777622, + "language_loss": 0.84032071, + "learning_rate": 3.98837155745293e-06, + "loss": 0.86341631, + "num_input_tokens_seen": 11264230, + "step": 527, + "time_per_iteration": 2.557490825653076 + }, + { + "auxiliary_loss_clip": 0.01263769, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_clip": 1.07662797, + "balance_loss_mlp": 1.03129542, + "epoch": 0.06348824625744003, + "flos": 19500895221120.0, + "grad_norm": 2.0420687094624435, + "language_loss": 0.7600944, + "learning_rate": 3.988287528218854e-06, + "loss": 0.78318799, + "num_input_tokens_seen": 11283015, + "step": 528, + "time_per_iteration": 3.268552303314209 + }, + { + "auxiliary_loss_clip": 0.01259022, + "auxiliary_loss_mlp": 0.01044304, + "balance_loss_clip": 1.07505417, + "balance_loss_mlp": 1.0313158, + "epoch": 0.06360848914807912, + "flos": 15481233976320.0, + "grad_norm": 2.07627932200372, + "language_loss": 0.90610939, + "learning_rate": 3.98820319736314e-06, + "loss": 0.92914271, + "num_input_tokens_seen": 11299630, + "step": 529, + "time_per_iteration": 2.5093588829040527 + }, + { + "auxiliary_loss_clip": 0.01228938, + "auxiliary_loss_mlp": 0.01044071, + "balance_loss_clip": 1.06564808, + "balance_loss_mlp": 1.03015924, + "epoch": 0.0637287320387182, + "flos": 20593369422720.0, + "grad_norm": 1.8016543805090741, + "language_loss": 0.85254645, + "learning_rate": 3.988118564898582e-06, + "loss": 0.87527651, + "num_input_tokens_seen": 11319170, + "step": 530, + "time_per_iteration": 2.573732852935791 + }, + { + "auxiliary_loss_clip": 0.01223118, + "auxiliary_loss_mlp": 0.0076726, + "balance_loss_clip": 1.07045996, + "balance_loss_mlp": 1.00039816, + "epoch": 0.0638489749293573, + "flos": 17412222245760.0, + "grad_norm": 2.2854967834248274, + "language_loss": 0.89324105, + "learning_rate": 3.988033630838019e-06, + "loss": 0.91314483, + "num_input_tokens_seen": 11333210, + "step": 531, + "time_per_iteration": 2.605701446533203 + }, + { + "auxiliary_loss_clip": 0.01263484, + "auxiliary_loss_mlp": 0.0105179, + "balance_loss_clip": 1.07501411, + "balance_loss_mlp": 1.03842676, + "epoch": 0.0639692178199964, + "flos": 23807661874560.0, + "grad_norm": 1.6475820456493724, + "language_loss": 0.88092899, + "learning_rate": 3.987948395194334e-06, + "loss": 0.9040817, + "num_input_tokens_seen": 11355590, + "step": 532, + "time_per_iteration": 4.175787925720215 + }, + { + "auxiliary_loss_clip": 0.01250721, + "auxiliary_loss_mlp": 0.01052487, + "balance_loss_clip": 1.06814122, + "balance_loss_mlp": 1.03903401, + "epoch": 0.06408946071063548, + "flos": 18477225521280.0, + "grad_norm": 2.1057366672911475, + "language_loss": 0.76576364, + "learning_rate": 3.987862857980458e-06, + "loss": 0.78879577, + "num_input_tokens_seen": 11371535, + "step": 533, + "time_per_iteration": 2.5321125984191895 + }, + { + "auxiliary_loss_clip": 0.01228782, + "auxiliary_loss_mlp": 0.01045632, + "balance_loss_clip": 1.06811428, + "balance_loss_mlp": 1.03112459, + "epoch": 0.06420970360127458, + "flos": 27162220936320.0, + "grad_norm": 2.634596840346206, + "language_loss": 0.76803386, + "learning_rate": 3.987777019209368e-06, + "loss": 0.79077792, + "num_input_tokens_seen": 11392050, + "step": 534, + "time_per_iteration": 3.3541109561920166 + }, + { + "auxiliary_loss_clip": 0.01278138, + "auxiliary_loss_mlp": 0.01042612, + "balance_loss_clip": 1.07637906, + "balance_loss_mlp": 1.02886152, + "epoch": 0.06432994649191366, + "flos": 23659673840640.0, + "grad_norm": 4.810119883205886, + "language_loss": 0.81286293, + "learning_rate": 3.987690878894084e-06, + "loss": 0.83607042, + "num_input_tokens_seen": 11411765, + "step": 535, + "time_per_iteration": 2.5653088092803955 + }, + { + "auxiliary_loss_clip": 0.01250204, + "auxiliary_loss_mlp": 0.01036321, + "balance_loss_clip": 1.07265449, + "balance_loss_mlp": 1.02244473, + "epoch": 0.06445018938255276, + "flos": 23403953940480.0, + "grad_norm": 2.6439322320632006, + "language_loss": 0.85333127, + "learning_rate": 3.987604437047673e-06, + "loss": 0.8761965, + "num_input_tokens_seen": 11431565, + "step": 536, + "time_per_iteration": 2.558048725128174 + }, + { + "auxiliary_loss_clip": 0.01256634, + "auxiliary_loss_mlp": 0.01043786, + "balance_loss_clip": 1.07157469, + "balance_loss_mlp": 1.03023195, + "epoch": 0.06457043227319184, + "flos": 19646692525440.0, + "grad_norm": 2.2994892071675324, + "language_loss": 0.77566165, + "learning_rate": 3.987517693683251e-06, + "loss": 0.79866588, + "num_input_tokens_seen": 11450140, + "step": 537, + "time_per_iteration": 2.570960760116577 + }, + { + "auxiliary_loss_clip": 0.01240312, + "auxiliary_loss_mlp": 0.01056201, + "balance_loss_clip": 1.07161307, + "balance_loss_mlp": 1.04209828, + "epoch": 0.06469067516383094, + "flos": 16978744915200.0, + "grad_norm": 3.178924399386921, + "language_loss": 0.9600358, + "learning_rate": 3.9874306488139745e-06, + "loss": 0.98300099, + "num_input_tokens_seen": 11465400, + "step": 538, + "time_per_iteration": 2.5246057510375977 + }, + { + "auxiliary_loss_clip": 0.01225697, + "auxiliary_loss_mlp": 0.01047211, + "balance_loss_clip": 1.06984317, + "balance_loss_mlp": 1.03340697, + "epoch": 0.06481091805447003, + "flos": 23296401642240.0, + "grad_norm": 2.022463506911064, + "language_loss": 0.87789249, + "learning_rate": 3.987343302453049e-06, + "loss": 0.90062153, + "num_input_tokens_seen": 11486675, + "step": 539, + "time_per_iteration": 2.6185922622680664 + }, + { + "auxiliary_loss_clip": 0.01242706, + "auxiliary_loss_mlp": 0.0104951, + "balance_loss_clip": 1.07214081, + "balance_loss_mlp": 1.03572941, + "epoch": 0.06493116094510912, + "flos": 29172356824320.0, + "grad_norm": 1.646683429518235, + "language_loss": 0.82634234, + "learning_rate": 3.987255654613724e-06, + "loss": 0.8492645, + "num_input_tokens_seen": 11510440, + "step": 540, + "time_per_iteration": 2.6307239532470703 + }, + { + "auxiliary_loss_clip": 0.01220481, + "auxiliary_loss_mlp": 0.01046343, + "balance_loss_clip": 1.06522894, + "balance_loss_mlp": 1.03256226, + "epoch": 0.06505140383574821, + "flos": 19865065259520.0, + "grad_norm": 2.8280155344709574, + "language_loss": 0.70145231, + "learning_rate": 3.987167705309296e-06, + "loss": 0.72412056, + "num_input_tokens_seen": 11529715, + "step": 541, + "time_per_iteration": 2.5936086177825928 + }, + { + "auxiliary_loss_clip": 0.01259629, + "auxiliary_loss_mlp": 0.00766089, + "balance_loss_clip": 1.07191896, + "balance_loss_mlp": 1.00040436, + "epoch": 0.0651716467263873, + "flos": 17924703540480.0, + "grad_norm": 2.014998082808893, + "language_loss": 0.95142359, + "learning_rate": 3.987079454553108e-06, + "loss": 0.97168076, + "num_input_tokens_seen": 11547665, + "step": 542, + "time_per_iteration": 2.5201218128204346 + }, + { + "auxiliary_loss_clip": 0.01223939, + "auxiliary_loss_mlp": 0.01043385, + "balance_loss_clip": 1.07018447, + "balance_loss_mlp": 1.02989042, + "epoch": 0.0652918896170264, + "flos": 20842840356480.0, + "grad_norm": 1.8811402526990344, + "language_loss": 0.91133386, + "learning_rate": 3.986990902358546e-06, + "loss": 0.93400711, + "num_input_tokens_seen": 11564605, + "step": 543, + "time_per_iteration": 2.5884342193603516 + }, + { + "auxiliary_loss_clip": 0.01259024, + "auxiliary_loss_mlp": 0.01048271, + "balance_loss_clip": 1.07132578, + "balance_loss_mlp": 1.03453207, + "epoch": 0.06541213250766549, + "flos": 21872507627520.0, + "grad_norm": 2.4480662023516198, + "language_loss": 0.93326581, + "learning_rate": 3.986902048739045e-06, + "loss": 0.95633876, + "num_input_tokens_seen": 11584550, + "step": 544, + "time_per_iteration": 2.540523052215576 + }, + { + "auxiliary_loss_clip": 0.01244425, + "auxiliary_loss_mlp": 0.0104968, + "balance_loss_clip": 1.07057416, + "balance_loss_mlp": 1.03486204, + "epoch": 0.06553237539830457, + "flos": 23110743219840.0, + "grad_norm": 2.685322106617403, + "language_loss": 0.79816675, + "learning_rate": 3.986812893708082e-06, + "loss": 0.82110775, + "num_input_tokens_seen": 11600740, + "step": 545, + "time_per_iteration": 2.569758415222168 + }, + { + "auxiliary_loss_clip": 0.01242421, + "auxiliary_loss_mlp": 0.01048732, + "balance_loss_clip": 1.06779075, + "balance_loss_mlp": 1.03405702, + "epoch": 0.06565261828894367, + "flos": 17923769786880.0, + "grad_norm": 2.3005544598330743, + "language_loss": 0.81333226, + "learning_rate": 3.9867234372791826e-06, + "loss": 0.83624381, + "num_input_tokens_seen": 11618695, + "step": 546, + "time_per_iteration": 2.511117935180664 + }, + { + "auxiliary_loss_clip": 0.01255258, + "auxiliary_loss_mlp": 0.0104612, + "balance_loss_clip": 1.07118583, + "balance_loss_mlp": 1.03232718, + "epoch": 0.06577286117958275, + "flos": 22783058421120.0, + "grad_norm": 3.6692112474506358, + "language_loss": 0.87165922, + "learning_rate": 3.986633679465918e-06, + "loss": 0.89467299, + "num_input_tokens_seen": 11638850, + "step": 547, + "time_per_iteration": 2.5443801879882812 + }, + { + "auxiliary_loss_clip": 0.01211543, + "auxiliary_loss_mlp": 0.01052541, + "balance_loss_clip": 1.06769454, + "balance_loss_mlp": 1.038939, + "epoch": 0.06589310407022185, + "flos": 23696194993920.0, + "grad_norm": 2.111359443670224, + "language_loss": 0.80513996, + "learning_rate": 3.986543620281904e-06, + "loss": 0.82778072, + "num_input_tokens_seen": 11658500, + "step": 548, + "time_per_iteration": 2.636155366897583 + }, + { + "auxiliary_loss_clip": 0.01223896, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_clip": 1.06502867, + "balance_loss_mlp": 1.02123213, + "epoch": 0.06601334696086093, + "flos": 26864772410880.0, + "grad_norm": 1.927825081736006, + "language_loss": 0.91351926, + "learning_rate": 3.986453259740802e-06, + "loss": 0.93610793, + "num_input_tokens_seen": 11676670, + "step": 549, + "time_per_iteration": 2.6165995597839355 + }, + { + "auxiliary_loss_clip": 0.01240177, + "auxiliary_loss_mlp": 0.01049228, + "balance_loss_clip": 1.07355571, + "balance_loss_mlp": 1.03520274, + "epoch": 0.06613358985150003, + "flos": 12567694101120.0, + "grad_norm": 2.7127872046687806, + "language_loss": 0.79120648, + "learning_rate": 3.986362597856319e-06, + "loss": 0.8141005, + "num_input_tokens_seen": 11693170, + "step": 550, + "time_per_iteration": 2.559262990951538 + }, + { + "auxiliary_loss_clip": 0.01237732, + "auxiliary_loss_mlp": 0.00767832, + "balance_loss_clip": 1.06790137, + "balance_loss_mlp": 1.00048375, + "epoch": 0.06625383274213913, + "flos": 18332505624960.0, + "grad_norm": 2.8655993041631063, + "language_loss": 0.81506467, + "learning_rate": 3.986271634642211e-06, + "loss": 0.83512026, + "num_input_tokens_seen": 11710150, + "step": 551, + "time_per_iteration": 2.5514111518859863 + }, + { + "auxiliary_loss_clip": 0.01271234, + "auxiliary_loss_mlp": 0.01046563, + "balance_loss_clip": 1.07411563, + "balance_loss_mlp": 1.03225207, + "epoch": 0.06637407563277821, + "flos": 15375585098880.0, + "grad_norm": 1.99877330424272, + "language_loss": 0.81640399, + "learning_rate": 3.986180370112274e-06, + "loss": 0.83958191, + "num_input_tokens_seen": 11726670, + "step": 552, + "time_per_iteration": 2.4699618816375732 + }, + { + "auxiliary_loss_clip": 0.01257554, + "auxiliary_loss_mlp": 0.00767611, + "balance_loss_clip": 1.07208824, + "balance_loss_mlp": 1.00044155, + "epoch": 0.0664943185234173, + "flos": 24025244509440.0, + "grad_norm": 1.748967635523427, + "language_loss": 0.74675941, + "learning_rate": 3.986088804280354e-06, + "loss": 0.76701099, + "num_input_tokens_seen": 11746400, + "step": 553, + "time_per_iteration": 2.5647308826446533 + }, + { + "auxiliary_loss_clip": 0.012432, + "auxiliary_loss_mlp": 0.01046937, + "balance_loss_clip": 1.07146311, + "balance_loss_mlp": 1.03258979, + "epoch": 0.06661456141405639, + "flos": 20957503547520.0, + "grad_norm": 2.530207107619296, + "language_loss": 0.94020468, + "learning_rate": 3.985996937160342e-06, + "loss": 0.96310604, + "num_input_tokens_seen": 11765590, + "step": 554, + "time_per_iteration": 3.3341805934906006 + }, + { + "auxiliary_loss_clip": 0.01251465, + "auxiliary_loss_mlp": 0.01051324, + "balance_loss_clip": 1.07013559, + "balance_loss_mlp": 1.03773403, + "epoch": 0.06673480430469549, + "flos": 52223953322880.0, + "grad_norm": 1.9353637869000901, + "language_loss": 0.68715233, + "learning_rate": 3.985904768766173e-06, + "loss": 0.71018022, + "num_input_tokens_seen": 11788365, + "step": 555, + "time_per_iteration": 2.8013808727264404 + }, + { + "auxiliary_loss_clip": 0.0122862, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_clip": 1.06846809, + "balance_loss_mlp": 1.03291917, + "epoch": 0.06685504719533458, + "flos": 16217079995520.0, + "grad_norm": 2.907012445313646, + "language_loss": 0.75966603, + "learning_rate": 3.98581229911183e-06, + "loss": 0.7824263, + "num_input_tokens_seen": 11807285, + "step": 556, + "time_per_iteration": 2.5596134662628174 + }, + { + "auxiliary_loss_clip": 0.01257191, + "auxiliary_loss_mlp": 0.01045985, + "balance_loss_clip": 1.06949735, + "balance_loss_mlp": 1.03170407, + "epoch": 0.06697529008597367, + "flos": 22491535639680.0, + "grad_norm": 1.7139576351672947, + "language_loss": 0.91908765, + "learning_rate": 3.985719528211341e-06, + "loss": 0.94211942, + "num_input_tokens_seen": 11826655, + "step": 557, + "time_per_iteration": 2.5499160289764404 + }, + { + "auxiliary_loss_clip": 0.01143971, + "auxiliary_loss_mlp": 0.0100841, + "balance_loss_clip": 1.03919816, + "balance_loss_mlp": 1.00252092, + "epoch": 0.06709553297661276, + "flos": 62688216936960.0, + "grad_norm": 0.8447410932211391, + "language_loss": 0.63014245, + "learning_rate": 3.985626456078777e-06, + "loss": 0.65166628, + "num_input_tokens_seen": 11891310, + "step": 558, + "time_per_iteration": 4.009221076965332 + }, + { + "auxiliary_loss_clip": 0.01228934, + "auxiliary_loss_mlp": 0.01045226, + "balance_loss_clip": 1.07019997, + "balance_loss_mlp": 1.03140962, + "epoch": 0.06721577586725185, + "flos": 11216590997760.0, + "grad_norm": 2.1453007329263425, + "language_loss": 0.86267483, + "learning_rate": 3.985533082728259e-06, + "loss": 0.88541645, + "num_input_tokens_seen": 11906965, + "step": 559, + "time_per_iteration": 3.4730141162872314 + }, + { + "auxiliary_loss_clip": 0.01277107, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.07436895, + "balance_loss_mlp": 1.02359629, + "epoch": 0.06733601875789094, + "flos": 25922189664000.0, + "grad_norm": 1.9090270022415432, + "language_loss": 0.75014746, + "learning_rate": 3.985439408173951e-06, + "loss": 0.77329493, + "num_input_tokens_seen": 11927190, + "step": 560, + "time_per_iteration": 2.5410356521606445 + }, + { + "auxiliary_loss_clip": 0.0127515, + "auxiliary_loss_mlp": 0.0105379, + "balance_loss_clip": 1.0754025, + "balance_loss_mlp": 1.03944969, + "epoch": 0.06745626164853002, + "flos": 20813645577600.0, + "grad_norm": 2.430005953821239, + "language_loss": 0.70951903, + "learning_rate": 3.9853454324300634e-06, + "loss": 0.73280847, + "num_input_tokens_seen": 11946400, + "step": 561, + "time_per_iteration": 3.2623889446258545 + }, + { + "auxiliary_loss_clip": 0.01199154, + "auxiliary_loss_mlp": 0.01042931, + "balance_loss_clip": 1.06361532, + "balance_loss_mlp": 1.0283339, + "epoch": 0.06757650453916912, + "flos": 19829262378240.0, + "grad_norm": 2.075310094903047, + "language_loss": 0.77732491, + "learning_rate": 3.985251155510852e-06, + "loss": 0.7997458, + "num_input_tokens_seen": 11965430, + "step": 562, + "time_per_iteration": 2.6828272342681885 + }, + { + "auxiliary_loss_clip": 0.01209868, + "auxiliary_loss_mlp": 0.01043947, + "balance_loss_clip": 1.07043219, + "balance_loss_mlp": 1.02976108, + "epoch": 0.06769674742980822, + "flos": 25739224761600.0, + "grad_norm": 1.775925135573773, + "language_loss": 0.80364096, + "learning_rate": 3.98515657743062e-06, + "loss": 0.82617909, + "num_input_tokens_seen": 11984895, + "step": 563, + "time_per_iteration": 2.674323320388794 + }, + { + "auxiliary_loss_clip": 0.01235705, + "auxiliary_loss_mlp": 0.01054189, + "balance_loss_clip": 1.06648397, + "balance_loss_mlp": 1.04071283, + "epoch": 0.0678169903204473, + "flos": 13074788355840.0, + "grad_norm": 2.2398220849929085, + "language_loss": 0.77869093, + "learning_rate": 3.985061698203711e-06, + "loss": 0.80158991, + "num_input_tokens_seen": 12002010, + "step": 564, + "time_per_iteration": 2.5196056365966797 + }, + { + "auxiliary_loss_clip": 0.01161138, + "auxiliary_loss_mlp": 0.01009265, + "balance_loss_clip": 1.03621054, + "balance_loss_mlp": 1.00354278, + "epoch": 0.0679372332110864, + "flos": 70865830788480.0, + "grad_norm": 0.8904670521634256, + "language_loss": 0.63779444, + "learning_rate": 3.984966517844523e-06, + "loss": 0.65949851, + "num_input_tokens_seen": 12057255, + "step": 565, + "time_per_iteration": 3.0515527725219727 + }, + { + "auxiliary_loss_clip": 0.01274531, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.07415497, + "balance_loss_mlp": 1.03561151, + "epoch": 0.06805747610172548, + "flos": 28256418990720.0, + "grad_norm": 2.4938047860429164, + "language_loss": 0.80811733, + "learning_rate": 3.984871036367492e-06, + "loss": 0.83135623, + "num_input_tokens_seen": 12077280, + "step": 566, + "time_per_iteration": 2.541935920715332 + }, + { + "auxiliary_loss_clip": 0.01254022, + "auxiliary_loss_mlp": 0.00766778, + "balance_loss_clip": 1.07211626, + "balance_loss_mlp": 1.00041664, + "epoch": 0.06817771899236458, + "flos": 20120533764480.0, + "grad_norm": 1.9718423772549256, + "language_loss": 0.83261657, + "learning_rate": 3.984775253787102e-06, + "loss": 0.85282451, + "num_input_tokens_seen": 12095570, + "step": 567, + "time_per_iteration": 2.543840169906616 + }, + { + "auxiliary_loss_clip": 0.01259013, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.0702976, + "balance_loss_mlp": 1.02662253, + "epoch": 0.06829796188300366, + "flos": 17930629284480.0, + "grad_norm": 2.9638526082362917, + "language_loss": 0.87789202, + "learning_rate": 3.984679170117885e-06, + "loss": 0.90088463, + "num_input_tokens_seen": 12111775, + "step": 568, + "time_per_iteration": 2.4911394119262695 + }, + { + "auxiliary_loss_clip": 0.01253339, + "auxiliary_loss_mlp": 0.01041753, + "balance_loss_clip": 1.06978703, + "balance_loss_mlp": 1.02701879, + "epoch": 0.06841820477364276, + "flos": 14501627285760.0, + "grad_norm": 2.599977270728226, + "language_loss": 0.78607631, + "learning_rate": 3.984582785374415e-06, + "loss": 0.80902725, + "num_input_tokens_seen": 12129215, + "step": 569, + "time_per_iteration": 2.512500524520874 + }, + { + "auxiliary_loss_clip": 0.01240847, + "auxiliary_loss_mlp": 0.00766995, + "balance_loss_clip": 1.07188869, + "balance_loss_mlp": 1.00038803, + "epoch": 0.06853844766428185, + "flos": 21938474954880.0, + "grad_norm": 2.106959947043695, + "language_loss": 0.80572426, + "learning_rate": 3.9844860995713155e-06, + "loss": 0.82580268, + "num_input_tokens_seen": 12148755, + "step": 570, + "time_per_iteration": 2.5807104110717773 + }, + { + "auxiliary_loss_clip": 0.01257449, + "auxiliary_loss_mlp": 0.01043734, + "balance_loss_clip": 1.07771003, + "balance_loss_mlp": 1.03032315, + "epoch": 0.06865869055492094, + "flos": 16800628348800.0, + "grad_norm": 2.3690490243003826, + "language_loss": 0.83040488, + "learning_rate": 3.9843891127232524e-06, + "loss": 0.8534168, + "num_input_tokens_seen": 12166290, + "step": 571, + "time_per_iteration": 2.5020804405212402 + }, + { + "auxiliary_loss_clip": 0.0119598, + "auxiliary_loss_mlp": 0.0104394, + "balance_loss_clip": 1.06299663, + "balance_loss_mlp": 1.02921748, + "epoch": 0.06877893344556003, + "flos": 19937281553280.0, + "grad_norm": 2.2342563898718804, + "language_loss": 0.66607964, + "learning_rate": 3.984291824844938e-06, + "loss": 0.68847883, + "num_input_tokens_seen": 12181385, + "step": 572, + "time_per_iteration": 2.6187117099761963 + }, + { + "auxiliary_loss_clip": 0.0127414, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.07464194, + "balance_loss_mlp": 1.02866173, + "epoch": 0.06889917633619912, + "flos": 23039388852480.0, + "grad_norm": 2.5559593446396294, + "language_loss": 0.85313123, + "learning_rate": 3.984194235951132e-06, + "loss": 0.87630028, + "num_input_tokens_seen": 12197530, + "step": 573, + "time_per_iteration": 2.500725746154785 + }, + { + "auxiliary_loss_clip": 0.01277281, + "auxiliary_loss_mlp": 0.01057823, + "balance_loss_clip": 1.07881522, + "balance_loss_mlp": 1.04467988, + "epoch": 0.06901941922683821, + "flos": 20960556203520.0, + "grad_norm": 2.6776634110256503, + "language_loss": 0.84668672, + "learning_rate": 3.9840963460566375e-06, + "loss": 0.87003773, + "num_input_tokens_seen": 12216310, + "step": 574, + "time_per_iteration": 2.4887170791625977 + }, + { + "auxiliary_loss_clip": 0.01180166, + "auxiliary_loss_mlp": 0.01040193, + "balance_loss_clip": 1.06158209, + "balance_loss_mlp": 1.02635908, + "epoch": 0.06913966211747731, + "flos": 24821850384000.0, + "grad_norm": 2.8095923957813396, + "language_loss": 0.89605409, + "learning_rate": 3.983998155176305e-06, + "loss": 0.91825771, + "num_input_tokens_seen": 12236670, + "step": 575, + "time_per_iteration": 2.8775432109832764 + }, + { + "auxiliary_loss_clip": 0.01160083, + "auxiliary_loss_mlp": 0.01004491, + "balance_loss_clip": 1.03653765, + "balance_loss_mlp": 0.99853033, + "epoch": 0.06925990500811639, + "flos": 58367446957440.0, + "grad_norm": 0.8249431166627818, + "language_loss": 0.57010722, + "learning_rate": 3.9838996633250305e-06, + "loss": 0.59175301, + "num_input_tokens_seen": 12297185, + "step": 576, + "time_per_iteration": 3.157111883163452 + }, + { + "auxiliary_loss_clip": 0.01255641, + "auxiliary_loss_mlp": 0.01048447, + "balance_loss_clip": 1.07015228, + "balance_loss_mlp": 1.03566158, + "epoch": 0.06938014789875549, + "flos": 12749940731520.0, + "grad_norm": 2.202914150551851, + "language_loss": 0.88080609, + "learning_rate": 3.983800870517753e-06, + "loss": 0.90384698, + "num_input_tokens_seen": 12313975, + "step": 577, + "time_per_iteration": 2.5130162239074707 + }, + { + "auxiliary_loss_clip": 0.01252016, + "auxiliary_loss_mlp": 0.01045524, + "balance_loss_clip": 1.07390738, + "balance_loss_mlp": 1.03321528, + "epoch": 0.06950039078939457, + "flos": 22820226019200.0, + "grad_norm": 2.539474317481431, + "language_loss": 0.78479326, + "learning_rate": 3.983701776769463e-06, + "loss": 0.80776858, + "num_input_tokens_seen": 12331385, + "step": 578, + "time_per_iteration": 2.5052263736724854 + }, + { + "auxiliary_loss_clip": 0.01250283, + "auxiliary_loss_mlp": 0.01043673, + "balance_loss_clip": 1.07302582, + "balance_loss_mlp": 1.02979124, + "epoch": 0.06962063368003367, + "flos": 21941348042880.0, + "grad_norm": 1.8980668948412203, + "language_loss": 0.85984695, + "learning_rate": 3.9836023820951885e-06, + "loss": 0.88278651, + "num_input_tokens_seen": 12350600, + "step": 579, + "time_per_iteration": 2.526423692703247 + }, + { + "auxiliary_loss_clip": 0.01216759, + "auxiliary_loss_mlp": 0.01049247, + "balance_loss_clip": 1.06326473, + "balance_loss_mlp": 1.03671849, + "epoch": 0.06974087657067275, + "flos": 20706021452160.0, + "grad_norm": 1.789721971271885, + "language_loss": 0.68311691, + "learning_rate": 3.983502686510011e-06, + "loss": 0.70577699, + "num_input_tokens_seen": 12371430, + "step": 580, + "time_per_iteration": 2.6056437492370605 + }, + { + "auxiliary_loss_clip": 0.01258208, + "auxiliary_loss_mlp": 0.00766454, + "balance_loss_clip": 1.06969035, + "balance_loss_mlp": 1.00043762, + "epoch": 0.06986111946131185, + "flos": 22638230784000.0, + "grad_norm": 2.542945349135428, + "language_loss": 0.73738778, + "learning_rate": 3.9834026900290525e-06, + "loss": 0.75763446, + "num_input_tokens_seen": 12390825, + "step": 581, + "time_per_iteration": 3.3825559616088867 + }, + { + "auxiliary_loss_clip": 0.01270935, + "auxiliary_loss_mlp": 0.01044763, + "balance_loss_clip": 1.07250535, + "balance_loss_mlp": 1.03175092, + "epoch": 0.06998136235195095, + "flos": 26943453152640.0, + "grad_norm": 2.0011242746998557, + "language_loss": 1.00358772, + "learning_rate": 3.983302392667482e-06, + "loss": 1.0267446, + "num_input_tokens_seen": 12411670, + "step": 582, + "time_per_iteration": 2.5829575061798096 + }, + { + "auxiliary_loss_clip": 0.01254151, + "auxiliary_loss_mlp": 0.01043282, + "balance_loss_clip": 1.07271314, + "balance_loss_mlp": 1.03006136, + "epoch": 0.07010160524259003, + "flos": 22492505306880.0, + "grad_norm": 1.8228352045585985, + "language_loss": 0.93882, + "learning_rate": 3.983201794440517e-06, + "loss": 0.96179426, + "num_input_tokens_seen": 12431245, + "step": 583, + "time_per_iteration": 2.4973533153533936 + }, + { + "auxiliary_loss_clip": 0.01227027, + "auxiliary_loss_mlp": 0.01040701, + "balance_loss_clip": 1.06721604, + "balance_loss_mlp": 1.02745128, + "epoch": 0.07022184813322913, + "flos": 18332541538560.0, + "grad_norm": 1.6934672667998625, + "language_loss": 0.67659652, + "learning_rate": 3.9831008953634165e-06, + "loss": 0.69927382, + "num_input_tokens_seen": 12450535, + "step": 584, + "time_per_iteration": 3.3078415393829346 + }, + { + "auxiliary_loss_clip": 0.01189039, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.05903637, + "balance_loss_mlp": 1.03095865, + "epoch": 0.07034209102386821, + "flos": 24675550289280.0, + "grad_norm": 2.2925334378134368, + "language_loss": 0.8127383, + "learning_rate": 3.9829996954514864e-06, + "loss": 0.83508384, + "num_input_tokens_seen": 12469675, + "step": 585, + "time_per_iteration": 2.6435909271240234 + }, + { + "auxiliary_loss_clip": 0.01244677, + "auxiliary_loss_mlp": 0.01047812, + "balance_loss_clip": 1.06925857, + "balance_loss_mlp": 1.03337574, + "epoch": 0.0704623339145073, + "flos": 25995878415360.0, + "grad_norm": 1.8228520911126989, + "language_loss": 0.84309483, + "learning_rate": 3.982898194720079e-06, + "loss": 0.86601979, + "num_input_tokens_seen": 12490405, + "step": 586, + "time_per_iteration": 3.4164116382598877 + }, + { + "auxiliary_loss_clip": 0.01236621, + "auxiliary_loss_mlp": 0.00767169, + "balance_loss_clip": 1.07208943, + "balance_loss_mlp": 1.00044227, + "epoch": 0.0705825768051464, + "flos": 25338318088320.0, + "grad_norm": 2.1328378285480496, + "language_loss": 0.82623172, + "learning_rate": 3.982796393184592e-06, + "loss": 0.84626967, + "num_input_tokens_seen": 12509485, + "step": 587, + "time_per_iteration": 3.375338554382324 + }, + { + "auxiliary_loss_clip": 0.0114264, + "auxiliary_loss_mlp": 0.01018892, + "balance_loss_clip": 1.03358495, + "balance_loss_mlp": 1.01264572, + "epoch": 0.07070281969578548, + "flos": 66047552507520.0, + "grad_norm": 0.7932328357860626, + "language_loss": 0.62630105, + "learning_rate": 3.98269429086047e-06, + "loss": 0.64791638, + "num_input_tokens_seen": 12567325, + "step": 588, + "time_per_iteration": 2.972320318222046 + }, + { + "auxiliary_loss_clip": 0.01227739, + "auxiliary_loss_mlp": 0.01047875, + "balance_loss_clip": 1.06882036, + "balance_loss_mlp": 1.03336143, + "epoch": 0.07082306258642458, + "flos": 23653568528640.0, + "grad_norm": 3.1409108826991807, + "language_loss": 0.8650409, + "learning_rate": 3.982591887763199e-06, + "loss": 0.887797, + "num_input_tokens_seen": 12584785, + "step": 589, + "time_per_iteration": 2.5690934658050537 + }, + { + "auxiliary_loss_clip": 0.01199029, + "auxiliary_loss_mlp": 0.01041068, + "balance_loss_clip": 1.05703413, + "balance_loss_mlp": 1.02635217, + "epoch": 0.07094330547706366, + "flos": 13880049408000.0, + "grad_norm": 2.259644323531196, + "language_loss": 0.81569409, + "learning_rate": 3.982489183908316e-06, + "loss": 0.83809513, + "num_input_tokens_seen": 12601205, + "step": 590, + "time_per_iteration": 2.539745807647705 + }, + { + "auxiliary_loss_clip": 0.01162121, + "auxiliary_loss_mlp": 0.01044046, + "balance_loss_clip": 1.05206561, + "balance_loss_mlp": 1.03160095, + "epoch": 0.07106354836770276, + "flos": 24645098534400.0, + "grad_norm": 2.361266753145197, + "language_loss": 0.84465516, + "learning_rate": 3.982386179311399e-06, + "loss": 0.86671686, + "num_input_tokens_seen": 12621725, + "step": 591, + "time_per_iteration": 2.6699905395507812 + }, + { + "auxiliary_loss_clip": 0.01258631, + "auxiliary_loss_mlp": 0.01047805, + "balance_loss_clip": 1.07177889, + "balance_loss_mlp": 1.03220034, + "epoch": 0.07118379125834184, + "flos": 16217223649920.0, + "grad_norm": 2.277611624363638, + "language_loss": 0.87452149, + "learning_rate": 3.982282873988075e-06, + "loss": 0.89758587, + "num_input_tokens_seen": 12639600, + "step": 592, + "time_per_iteration": 2.4876914024353027 + }, + { + "auxiliary_loss_clip": 0.01238607, + "auxiliary_loss_mlp": 0.01042214, + "balance_loss_clip": 1.07014549, + "balance_loss_mlp": 1.02961385, + "epoch": 0.07130403414898094, + "flos": 19719986227200.0, + "grad_norm": 1.6624626919673313, + "language_loss": 0.87099338, + "learning_rate": 3.982179267954016e-06, + "loss": 0.89380157, + "num_input_tokens_seen": 12660030, + "step": 593, + "time_per_iteration": 2.58138108253479 + }, + { + "auxiliary_loss_clip": 0.0126898, + "auxiliary_loss_mlp": 0.01038798, + "balance_loss_clip": 1.07175934, + "balance_loss_mlp": 1.02510643, + "epoch": 0.07142427703962004, + "flos": 21871933009920.0, + "grad_norm": 2.132711499028081, + "language_loss": 0.96150446, + "learning_rate": 3.982075361224937e-06, + "loss": 0.98458219, + "num_input_tokens_seen": 12678395, + "step": 594, + "time_per_iteration": 2.519207239151001 + }, + { + "auxiliary_loss_clip": 0.0124978, + "auxiliary_loss_mlp": 0.00766412, + "balance_loss_clip": 1.07181573, + "balance_loss_mlp": 1.00041747, + "epoch": 0.07154451993025912, + "flos": 18296595002880.0, + "grad_norm": 2.029373387923085, + "language_loss": 0.87810063, + "learning_rate": 3.981971153816602e-06, + "loss": 0.89826262, + "num_input_tokens_seen": 12696000, + "step": 595, + "time_per_iteration": 2.512521743774414 + }, + { + "auxiliary_loss_clip": 0.01269919, + "auxiliary_loss_mlp": 0.01041849, + "balance_loss_clip": 1.07637262, + "balance_loss_mlp": 1.02937961, + "epoch": 0.07166476282089822, + "flos": 22160690444160.0, + "grad_norm": 1.6195154270940286, + "language_loss": 0.96278137, + "learning_rate": 3.981866645744819e-06, + "loss": 0.98589909, + "num_input_tokens_seen": 12716715, + "step": 596, + "time_per_iteration": 2.5581467151641846 + }, + { + "auxiliary_loss_clip": 0.01272034, + "auxiliary_loss_mlp": 0.00767275, + "balance_loss_clip": 1.07455778, + "balance_loss_mlp": 1.00038826, + "epoch": 0.0717850057115373, + "flos": 14136343925760.0, + "grad_norm": 2.0429279469758113, + "language_loss": 0.81561875, + "learning_rate": 3.9817618370254416e-06, + "loss": 0.83601189, + "num_input_tokens_seen": 12733370, + "step": 597, + "time_per_iteration": 2.490833282470703 + }, + { + "auxiliary_loss_clip": 0.01270889, + "auxiliary_loss_mlp": 0.01050715, + "balance_loss_clip": 1.07419133, + "balance_loss_mlp": 1.03690469, + "epoch": 0.0719052486021764, + "flos": 30917794412160.0, + "grad_norm": 2.035687491896235, + "language_loss": 0.87465435, + "learning_rate": 3.9816567276743684e-06, + "loss": 0.89787042, + "num_input_tokens_seen": 12753235, + "step": 598, + "time_per_iteration": 2.560598134994507 + }, + { + "auxiliary_loss_clip": 0.01233461, + "auxiliary_loss_mlp": 0.01042264, + "balance_loss_clip": 1.06928658, + "balance_loss_mlp": 1.02846587, + "epoch": 0.0720254914928155, + "flos": 21287019939840.0, + "grad_norm": 2.1583285288588976, + "language_loss": 0.77268213, + "learning_rate": 3.9815513177075466e-06, + "loss": 0.79543942, + "num_input_tokens_seen": 12772020, + "step": 599, + "time_per_iteration": 2.5347886085510254 + }, + { + "auxiliary_loss_clip": 0.01243331, + "auxiliary_loss_mlp": 0.01043086, + "balance_loss_clip": 1.06968951, + "balance_loss_mlp": 1.0309267, + "epoch": 0.07214573438345458, + "flos": 27819170732160.0, + "grad_norm": 1.6157611217976484, + "language_loss": 0.70185947, + "learning_rate": 3.9814456071409646e-06, + "loss": 0.72472358, + "num_input_tokens_seen": 12792555, + "step": 600, + "time_per_iteration": 2.6127421855926514 + }, + { + "auxiliary_loss_clip": 0.01207243, + "auxiliary_loss_mlp": 0.01054259, + "balance_loss_clip": 1.06441975, + "balance_loss_mlp": 1.03988266, + "epoch": 0.07226597727409367, + "flos": 25483576688640.0, + "grad_norm": 3.3255513682903772, + "language_loss": 0.85243708, + "learning_rate": 3.981339595990659e-06, + "loss": 0.87505209, + "num_input_tokens_seen": 12811085, + "step": 601, + "time_per_iteration": 2.6214487552642822 + }, + { + "auxiliary_loss_clip": 0.01252778, + "auxiliary_loss_mlp": 0.01051634, + "balance_loss_clip": 1.07165539, + "balance_loss_mlp": 1.03695917, + "epoch": 0.07238622016473276, + "flos": 23513840622720.0, + "grad_norm": 2.047304202507383, + "language_loss": 0.80958486, + "learning_rate": 3.981233284272713e-06, + "loss": 0.83262897, + "num_input_tokens_seen": 12830830, + "step": 602, + "time_per_iteration": 2.525503396987915 + }, + { + "auxiliary_loss_clip": 0.0122027, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.06605244, + "balance_loss_mlp": 1.03034663, + "epoch": 0.07250646305537185, + "flos": 25453519983360.0, + "grad_norm": 2.1507699758599914, + "language_loss": 0.90146458, + "learning_rate": 3.981126672003253e-06, + "loss": 0.92409182, + "num_input_tokens_seen": 12853505, + "step": 603, + "time_per_iteration": 2.6561989784240723 + }, + { + "auxiliary_loss_clip": 0.01239124, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_clip": 1.06450915, + "balance_loss_mlp": 1.03307962, + "epoch": 0.07262670594601094, + "flos": 27155038216320.0, + "grad_norm": 4.26484111265464, + "language_loss": 0.78068703, + "learning_rate": 3.981019759198451e-06, + "loss": 0.80353779, + "num_input_tokens_seen": 12872455, + "step": 604, + "time_per_iteration": 2.6214585304260254 + }, + { + "auxiliary_loss_clip": 0.012373, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.06864572, + "balance_loss_mlp": 1.03468108, + "epoch": 0.07274694883665003, + "flos": 26651607148800.0, + "grad_norm": 2.466745686178935, + "language_loss": 0.84249026, + "learning_rate": 3.980912545874528e-06, + "loss": 0.86534727, + "num_input_tokens_seen": 12892620, + "step": 605, + "time_per_iteration": 2.586289405822754 + }, + { + "auxiliary_loss_clip": 0.01248823, + "auxiliary_loss_mlp": 0.007667, + "balance_loss_clip": 1.07069409, + "balance_loss_mlp": 1.00038338, + "epoch": 0.07286719172728913, + "flos": 29862344154240.0, + "grad_norm": 1.8941259392666263, + "language_loss": 0.85113156, + "learning_rate": 3.980805032047746e-06, + "loss": 0.87128687, + "num_input_tokens_seen": 12914090, + "step": 606, + "time_per_iteration": 2.5893375873565674 + }, + { + "auxiliary_loss_clip": 0.01230261, + "auxiliary_loss_mlp": 0.01045049, + "balance_loss_clip": 1.06495249, + "balance_loss_mlp": 1.02938533, + "epoch": 0.07298743461792821, + "flos": 17382057799680.0, + "grad_norm": 2.07065328759031, + "language_loss": 0.81085038, + "learning_rate": 3.980697217734415e-06, + "loss": 0.8336035, + "num_input_tokens_seen": 12931830, + "step": 607, + "time_per_iteration": 3.341517925262451 + }, + { + "auxiliary_loss_clip": 0.01208733, + "auxiliary_loss_mlp": 0.00766282, + "balance_loss_clip": 1.06585383, + "balance_loss_mlp": 1.00038433, + "epoch": 0.07310767750856731, + "flos": 19498201701120.0, + "grad_norm": 3.847156902932048, + "language_loss": 0.91369158, + "learning_rate": 3.980589102950891e-06, + "loss": 0.93344176, + "num_input_tokens_seen": 12949995, + "step": 608, + "time_per_iteration": 2.586209535598755 + }, + { + "auxiliary_loss_clip": 0.01237563, + "auxiliary_loss_mlp": 0.01045981, + "balance_loss_clip": 1.07175493, + "balance_loss_mlp": 1.03175354, + "epoch": 0.07322792039920639, + "flos": 29168693637120.0, + "grad_norm": 2.4805693067264927, + "language_loss": 0.76412082, + "learning_rate": 3.9804806877135755e-06, + "loss": 0.78695619, + "num_input_tokens_seen": 12968040, + "step": 609, + "time_per_iteration": 2.6467506885528564 + }, + { + "auxiliary_loss_clip": 0.01258454, + "auxiliary_loss_mlp": 0.00767288, + "balance_loss_clip": 1.07046413, + "balance_loss_mlp": 1.00043476, + "epoch": 0.07334816328984549, + "flos": 23477822259840.0, + "grad_norm": 2.431415471061829, + "language_loss": 0.85902739, + "learning_rate": 3.980371972038915e-06, + "loss": 0.87928486, + "num_input_tokens_seen": 12988530, + "step": 610, + "time_per_iteration": 2.5906574726104736 + }, + { + "auxiliary_loss_clip": 0.01270548, + "auxiliary_loss_mlp": 0.01048316, + "balance_loss_clip": 1.0740211, + "balance_loss_mlp": 1.03436267, + "epoch": 0.07346840618048459, + "flos": 22962467877120.0, + "grad_norm": 1.8416321857646867, + "language_loss": 0.84427935, + "learning_rate": 3.980262955943399e-06, + "loss": 0.867468, + "num_input_tokens_seen": 13008195, + "step": 611, + "time_per_iteration": 3.3923685550689697 + }, + { + "auxiliary_loss_clip": 0.01229167, + "auxiliary_loss_mlp": 0.01045747, + "balance_loss_clip": 1.07031846, + "balance_loss_mlp": 1.03315806, + "epoch": 0.07358864907112367, + "flos": 17673903803520.0, + "grad_norm": 2.529981481700748, + "language_loss": 0.86986494, + "learning_rate": 3.980153639443569e-06, + "loss": 0.89261407, + "num_input_tokens_seen": 13024180, + "step": 612, + "time_per_iteration": 3.392343759536743 + }, + { + "auxiliary_loss_clip": 0.01241957, + "auxiliary_loss_mlp": 0.01045824, + "balance_loss_clip": 1.07091379, + "balance_loss_mlp": 1.03235972, + "epoch": 0.07370889196176277, + "flos": 24097029840000.0, + "grad_norm": 2.0301275125406475, + "language_loss": 0.8016212, + "learning_rate": 3.980044022556005e-06, + "loss": 0.82449901, + "num_input_tokens_seen": 13043865, + "step": 613, + "time_per_iteration": 2.5738940238952637 + }, + { + "auxiliary_loss_clip": 0.01253469, + "auxiliary_loss_mlp": 0.01054136, + "balance_loss_clip": 1.07109725, + "balance_loss_mlp": 1.04085612, + "epoch": 0.07382913485240185, + "flos": 25885919905920.0, + "grad_norm": 2.2365504979392195, + "language_loss": 0.72833198, + "learning_rate": 3.9799341052973375e-06, + "loss": 0.75140798, + "num_input_tokens_seen": 13063700, + "step": 614, + "time_per_iteration": 3.3543448448181152 + }, + { + "auxiliary_loss_clip": 0.01238887, + "auxiliary_loss_mlp": 0.01041745, + "balance_loss_clip": 1.07326174, + "balance_loss_mlp": 1.02729702, + "epoch": 0.07394937774304094, + "flos": 16873850223360.0, + "grad_norm": 2.370760727714136, + "language_loss": 0.74629426, + "learning_rate": 3.979823887684241e-06, + "loss": 0.76910055, + "num_input_tokens_seen": 13082640, + "step": 615, + "time_per_iteration": 2.532843828201294 + }, + { + "auxiliary_loss_clip": 0.01268693, + "auxiliary_loss_mlp": 0.01048816, + "balance_loss_clip": 1.07419467, + "balance_loss_mlp": 1.03496361, + "epoch": 0.07406962063368003, + "flos": 20703471586560.0, + "grad_norm": 6.031963824455091, + "language_loss": 0.84616119, + "learning_rate": 3.979713369733434e-06, + "loss": 0.86933625, + "num_input_tokens_seen": 13100505, + "step": 616, + "time_per_iteration": 2.4737789630889893 + }, + { + "auxiliary_loss_clip": 0.01249145, + "auxiliary_loss_mlp": 0.01053868, + "balance_loss_clip": 1.07168019, + "balance_loss_mlp": 1.04010522, + "epoch": 0.07418986352431912, + "flos": 21430985650560.0, + "grad_norm": 2.1586248099889898, + "language_loss": 0.84475207, + "learning_rate": 3.979602551461683e-06, + "loss": 0.86778224, + "num_input_tokens_seen": 13121285, + "step": 617, + "time_per_iteration": 2.545459747314453 + }, + { + "auxiliary_loss_clip": 0.01232963, + "auxiliary_loss_mlp": 0.01045402, + "balance_loss_clip": 1.06911814, + "balance_loss_mlp": 1.03206229, + "epoch": 0.07431010641495822, + "flos": 12021133777920.0, + "grad_norm": 2.211790457161869, + "language_loss": 0.91610944, + "learning_rate": 3.979491432885799e-06, + "loss": 0.93889308, + "num_input_tokens_seen": 13137550, + "step": 618, + "time_per_iteration": 2.505262613296509 + }, + { + "auxiliary_loss_clip": 0.01199363, + "auxiliary_loss_mlp": 0.00766176, + "balance_loss_clip": 1.06141984, + "balance_loss_mlp": 1.00033629, + "epoch": 0.0744303493055973, + "flos": 20957575374720.0, + "grad_norm": 3.1984461305388243, + "language_loss": 0.83165497, + "learning_rate": 3.97938001402264e-06, + "loss": 0.85131025, + "num_input_tokens_seen": 13156675, + "step": 619, + "time_per_iteration": 2.5636367797851562 + }, + { + "auxiliary_loss_clip": 0.01214358, + "auxiliary_loss_mlp": 0.01044044, + "balance_loss_clip": 1.06799436, + "balance_loss_mlp": 1.0309844, + "epoch": 0.0745505921962364, + "flos": 16253134272000.0, + "grad_norm": 2.762055545839595, + "language_loss": 0.79720354, + "learning_rate": 3.979268294889105e-06, + "loss": 0.81978762, + "num_input_tokens_seen": 13172225, + "step": 620, + "time_per_iteration": 2.5558083057403564 + }, + { + "auxiliary_loss_clip": 0.01268026, + "auxiliary_loss_mlp": 0.01045921, + "balance_loss_clip": 1.07303298, + "balance_loss_mlp": 1.03282547, + "epoch": 0.07467083508687548, + "flos": 50944635550080.0, + "grad_norm": 1.8737530166268968, + "language_loss": 0.7385307, + "learning_rate": 3.979156275502143e-06, + "loss": 0.76167017, + "num_input_tokens_seen": 13195885, + "step": 621, + "time_per_iteration": 2.7360219955444336 + }, + { + "auxiliary_loss_clip": 0.0122435, + "auxiliary_loss_mlp": 0.01049868, + "balance_loss_clip": 1.06858814, + "balance_loss_mlp": 1.03562284, + "epoch": 0.07479107797751458, + "flos": 17529686697600.0, + "grad_norm": 2.2542555759247103, + "language_loss": 0.91447878, + "learning_rate": 3.979043955878749e-06, + "loss": 0.93722099, + "num_input_tokens_seen": 13213730, + "step": 622, + "time_per_iteration": 2.5534796714782715 + }, + { + "auxiliary_loss_clip": 0.0123374, + "auxiliary_loss_mlp": 0.01041248, + "balance_loss_clip": 1.06887007, + "balance_loss_mlp": 1.02829587, + "epoch": 0.07491132086815366, + "flos": 23473943591040.0, + "grad_norm": 2.323915351861144, + "language_loss": 0.83450806, + "learning_rate": 3.978931336035959e-06, + "loss": 0.85725796, + "num_input_tokens_seen": 13232540, + "step": 623, + "time_per_iteration": 2.5568387508392334 + }, + { + "auxiliary_loss_clip": 0.0125447, + "auxiliary_loss_mlp": 0.01048807, + "balance_loss_clip": 1.07419753, + "balance_loss_mlp": 1.03507972, + "epoch": 0.07503156375879276, + "flos": 20157557708160.0, + "grad_norm": 2.1762620619161925, + "language_loss": 0.82393086, + "learning_rate": 3.9788184159908595e-06, + "loss": 0.84696364, + "num_input_tokens_seen": 13249670, + "step": 624, + "time_per_iteration": 2.513934373855591 + }, + { + "auxiliary_loss_clip": 0.01230364, + "auxiliary_loss_mlp": 0.01049586, + "balance_loss_clip": 1.06845868, + "balance_loss_mlp": 1.03695536, + "epoch": 0.07515180664943186, + "flos": 15115519653120.0, + "grad_norm": 2.3595351152660653, + "language_loss": 0.81973308, + "learning_rate": 3.97870519576058e-06, + "loss": 0.84253263, + "num_input_tokens_seen": 13266095, + "step": 625, + "time_per_iteration": 2.5559334754943848 + }, + { + "auxiliary_loss_clip": 0.01218141, + "auxiliary_loss_mlp": 0.00767158, + "balance_loss_clip": 1.06659019, + "balance_loss_mlp": 1.0003103, + "epoch": 0.07527204954007094, + "flos": 21287702298240.0, + "grad_norm": 2.801635387591967, + "language_loss": 0.80702305, + "learning_rate": 3.978591675362295e-06, + "loss": 0.82687598, + "num_input_tokens_seen": 13284810, + "step": 626, + "time_per_iteration": 2.595956802368164 + }, + { + "auxiliary_loss_clip": 0.01201154, + "auxiliary_loss_mlp": 0.01044592, + "balance_loss_clip": 1.06840372, + "balance_loss_mlp": 1.03134155, + "epoch": 0.07539229243071004, + "flos": 21324187537920.0, + "grad_norm": 1.6730813388501025, + "language_loss": 0.87549746, + "learning_rate": 3.978477854813226e-06, + "loss": 0.89795494, + "num_input_tokens_seen": 13304150, + "step": 627, + "time_per_iteration": 2.600982666015625 + }, + { + "auxiliary_loss_clip": 0.01253248, + "auxiliary_loss_mlp": 0.01048684, + "balance_loss_clip": 1.07085347, + "balance_loss_mlp": 1.03615475, + "epoch": 0.07551253532134912, + "flos": 13042540920960.0, + "grad_norm": 1.9558530616412129, + "language_loss": 0.82543725, + "learning_rate": 3.97836373413064e-06, + "loss": 0.8484565, + "num_input_tokens_seen": 13322205, + "step": 628, + "time_per_iteration": 2.5177395343780518 + }, + { + "auxiliary_loss_clip": 0.01266088, + "auxiliary_loss_mlp": 0.01043854, + "balance_loss_clip": 1.0708847, + "balance_loss_mlp": 1.03028774, + "epoch": 0.07563277821198822, + "flos": 19208761908480.0, + "grad_norm": 1.9345176139540943, + "language_loss": 0.74608195, + "learning_rate": 3.978249313331848e-06, + "loss": 0.76918143, + "num_input_tokens_seen": 13340435, + "step": 629, + "time_per_iteration": 2.465623378753662 + }, + { + "auxiliary_loss_clip": 0.01258161, + "auxiliary_loss_mlp": 0.00766786, + "balance_loss_clip": 1.06967878, + "balance_loss_mlp": 1.00033998, + "epoch": 0.07575302110262731, + "flos": 19537200892800.0, + "grad_norm": 3.378119891573478, + "language_loss": 0.61919975, + "learning_rate": 3.978134592434208e-06, + "loss": 0.63944924, + "num_input_tokens_seen": 13358185, + "step": 630, + "time_per_iteration": 2.564579486846924 + }, + { + "auxiliary_loss_clip": 0.01114498, + "auxiliary_loss_mlp": 0.01025124, + "balance_loss_clip": 1.04549789, + "balance_loss_mlp": 1.01830506, + "epoch": 0.0758732639932664, + "flos": 67961808017280.0, + "grad_norm": 1.0213358593015807, + "language_loss": 0.59412086, + "learning_rate": 3.978019571455123e-06, + "loss": 0.61551702, + "num_input_tokens_seen": 13410130, + "step": 631, + "time_per_iteration": 3.167510509490967 + }, + { + "auxiliary_loss_clip": 0.01265159, + "auxiliary_loss_mlp": 0.01043403, + "balance_loss_clip": 1.07312822, + "balance_loss_mlp": 1.0312674, + "epoch": 0.07599350688390549, + "flos": 18989204025600.0, + "grad_norm": 2.048854635764491, + "language_loss": 0.84041762, + "learning_rate": 3.977904250412042e-06, + "loss": 0.86350322, + "num_input_tokens_seen": 13429085, + "step": 632, + "time_per_iteration": 2.4932351112365723 + }, + { + "auxiliary_loss_clip": 0.0121114, + "auxiliary_loss_mlp": 0.01043917, + "balance_loss_clip": 1.06574404, + "balance_loss_mlp": 1.03105998, + "epoch": 0.07611374977454458, + "flos": 21069006341760.0, + "grad_norm": 2.2991946161603662, + "language_loss": 0.85610855, + "learning_rate": 3.97778862932246e-06, + "loss": 0.87865913, + "num_input_tokens_seen": 13446250, + "step": 633, + "time_per_iteration": 2.5709784030914307 + }, + { + "auxiliary_loss_clip": 0.01163959, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.05389297, + "balance_loss_mlp": 1.02615809, + "epoch": 0.07623399266518367, + "flos": 18514536773760.0, + "grad_norm": 2.2726817816136817, + "language_loss": 0.94089937, + "learning_rate": 3.9776727082039144e-06, + "loss": 0.96293008, + "num_input_tokens_seen": 13463220, + "step": 634, + "time_per_iteration": 3.648066520690918 + }, + { + "auxiliary_loss_clip": 0.01176437, + "auxiliary_loss_mlp": 0.01020059, + "balance_loss_clip": 1.05035758, + "balance_loss_mlp": 1.01281059, + "epoch": 0.07635423555582276, + "flos": 44663036077440.0, + "grad_norm": 0.82104065496255, + "language_loss": 0.55479491, + "learning_rate": 3.977556487073991e-06, + "loss": 0.57675982, + "num_input_tokens_seen": 13517775, + "step": 635, + "time_per_iteration": 3.402374505996704 + }, + { + "auxiliary_loss_clip": 0.01223577, + "auxiliary_loss_mlp": 0.0104852, + "balance_loss_clip": 1.06167388, + "balance_loss_mlp": 1.03644371, + "epoch": 0.07647447844646185, + "flos": 21761148487680.0, + "grad_norm": 1.946885550848919, + "language_loss": 0.81662524, + "learning_rate": 3.97743996595032e-06, + "loss": 0.83934617, + "num_input_tokens_seen": 13537815, + "step": 636, + "time_per_iteration": 2.5996227264404297 + }, + { + "auxiliary_loss_clip": 0.01265604, + "auxiliary_loss_mlp": 0.01049177, + "balance_loss_clip": 1.0724721, + "balance_loss_mlp": 1.03498518, + "epoch": 0.07659472133710095, + "flos": 23806799948160.0, + "grad_norm": 1.6803141202601504, + "language_loss": 0.8170054, + "learning_rate": 3.9773231448505804e-06, + "loss": 0.84015316, + "num_input_tokens_seen": 13559605, + "step": 637, + "time_per_iteration": 2.537360191345215 + }, + { + "auxiliary_loss_clip": 0.01230394, + "auxiliary_loss_mlp": 0.0076715, + "balance_loss_clip": 1.06851327, + "balance_loss_mlp": 1.00029206, + "epoch": 0.07671496422774003, + "flos": 21469984842240.0, + "grad_norm": 2.2124122697110415, + "language_loss": 0.78051829, + "learning_rate": 3.977206023792491e-06, + "loss": 0.80049372, + "num_input_tokens_seen": 13579495, + "step": 638, + "time_per_iteration": 3.310861349105835 + }, + { + "auxiliary_loss_clip": 0.0125072, + "auxiliary_loss_mlp": 0.01058308, + "balance_loss_clip": 1.07406616, + "balance_loss_mlp": 1.04571342, + "epoch": 0.07683520711837913, + "flos": 16980971558400.0, + "grad_norm": 2.2335488895141045, + "language_loss": 0.81255984, + "learning_rate": 3.97708860279382e-06, + "loss": 0.83565015, + "num_input_tokens_seen": 13597605, + "step": 639, + "time_per_iteration": 3.3759865760803223 + }, + { + "auxiliary_loss_clip": 0.01214848, + "auxiliary_loss_mlp": 0.01053252, + "balance_loss_clip": 1.06397736, + "balance_loss_mlp": 1.03928065, + "epoch": 0.07695545000901821, + "flos": 23476744851840.0, + "grad_norm": 1.8175020274815743, + "language_loss": 0.78140259, + "learning_rate": 3.97697088187238e-06, + "loss": 0.80408359, + "num_input_tokens_seen": 13618120, + "step": 640, + "time_per_iteration": 2.6446046829223633 + }, + { + "auxiliary_loss_clip": 0.01230928, + "auxiliary_loss_mlp": 0.01050001, + "balance_loss_clip": 1.07108378, + "balance_loss_mlp": 1.03768706, + "epoch": 0.07707569289965731, + "flos": 17634258167040.0, + "grad_norm": 2.831044167807043, + "language_loss": 0.92103654, + "learning_rate": 3.976852861046029e-06, + "loss": 0.94384587, + "num_input_tokens_seen": 13634735, + "step": 641, + "time_per_iteration": 3.3709909915924072 + }, + { + "auxiliary_loss_clip": 0.01201538, + "auxiliary_loss_mlp": 0.01045002, + "balance_loss_clip": 1.06465745, + "balance_loss_mlp": 1.03225219, + "epoch": 0.0771959357902964, + "flos": 25775674087680.0, + "grad_norm": 1.579703201189408, + "language_loss": 0.80200726, + "learning_rate": 3.97673454033267e-06, + "loss": 0.82447267, + "num_input_tokens_seen": 13656835, + "step": 642, + "time_per_iteration": 2.7125182151794434 + }, + { + "auxiliary_loss_clip": 0.01229893, + "auxiliary_loss_mlp": 0.01050755, + "balance_loss_clip": 1.06527793, + "balance_loss_mlp": 1.03763044, + "epoch": 0.07731617868093549, + "flos": 19828651847040.0, + "grad_norm": 1.9451512243597169, + "language_loss": 0.82321537, + "learning_rate": 3.976615919750254e-06, + "loss": 0.84602189, + "num_input_tokens_seen": 13674535, + "step": 643, + "time_per_iteration": 2.5751240253448486 + }, + { + "auxiliary_loss_clip": 0.01246034, + "auxiliary_loss_mlp": 0.01049205, + "balance_loss_clip": 1.06955707, + "balance_loss_mlp": 1.03494167, + "epoch": 0.07743642157157458, + "flos": 21324654414720.0, + "grad_norm": 2.091017236888172, + "language_loss": 0.86677492, + "learning_rate": 3.976496999316775e-06, + "loss": 0.88972723, + "num_input_tokens_seen": 13693290, + "step": 644, + "time_per_iteration": 2.5353384017944336 + }, + { + "auxiliary_loss_clip": 0.01229732, + "auxiliary_loss_mlp": 0.01049438, + "balance_loss_clip": 1.07174492, + "balance_loss_mlp": 1.03572929, + "epoch": 0.07755666446221367, + "flos": 19969133938560.0, + "grad_norm": 2.1687205140379144, + "language_loss": 0.84138197, + "learning_rate": 3.976377779050271e-06, + "loss": 0.86417365, + "num_input_tokens_seen": 13711420, + "step": 645, + "time_per_iteration": 2.529776096343994 + }, + { + "auxiliary_loss_clip": 0.01238387, + "auxiliary_loss_mlp": 0.01052681, + "balance_loss_clip": 1.06626809, + "balance_loss_mlp": 1.03955626, + "epoch": 0.07767690735285276, + "flos": 23623224514560.0, + "grad_norm": 2.204351951587358, + "language_loss": 0.84668046, + "learning_rate": 3.976258258968831e-06, + "loss": 0.86959118, + "num_input_tokens_seen": 13729965, + "step": 646, + "time_per_iteration": 2.5297060012817383 + }, + { + "auxiliary_loss_clip": 0.01214031, + "auxiliary_loss_mlp": 0.01049297, + "balance_loss_clip": 1.06744981, + "balance_loss_mlp": 1.03694677, + "epoch": 0.07779715024349185, + "flos": 22236246702720.0, + "grad_norm": 2.2872897961186633, + "language_loss": 0.7417531, + "learning_rate": 3.976138439090583e-06, + "loss": 0.76438642, + "num_input_tokens_seen": 13748045, + "step": 647, + "time_per_iteration": 2.563839912414551 + }, + { + "auxiliary_loss_clip": 0.01218976, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.06795955, + "balance_loss_mlp": 1.02790403, + "epoch": 0.07791739313413094, + "flos": 20955097336320.0, + "grad_norm": 2.080131721883903, + "language_loss": 0.85294449, + "learning_rate": 3.976018319433706e-06, + "loss": 0.87555039, + "num_input_tokens_seen": 13765590, + "step": 648, + "time_per_iteration": 2.6100010871887207 + }, + { + "auxiliary_loss_clip": 0.01244642, + "auxiliary_loss_mlp": 0.01045595, + "balance_loss_clip": 1.06948924, + "balance_loss_mlp": 1.03258371, + "epoch": 0.07803763602477004, + "flos": 19312327797120.0, + "grad_norm": 2.919402372990991, + "language_loss": 0.90911055, + "learning_rate": 3.9758979000164205e-06, + "loss": 0.93201292, + "num_input_tokens_seen": 13782410, + "step": 649, + "time_per_iteration": 2.493290424346924 + }, + { + "auxiliary_loss_clip": 0.01222736, + "auxiliary_loss_mlp": 0.01038453, + "balance_loss_clip": 1.06792569, + "balance_loss_mlp": 1.02435088, + "epoch": 0.07815787891540912, + "flos": 22710806213760.0, + "grad_norm": 1.9905593002458517, + "language_loss": 0.71715081, + "learning_rate": 3.975777180856995e-06, + "loss": 0.73976272, + "num_input_tokens_seen": 13801530, + "step": 650, + "time_per_iteration": 2.5844192504882812 + }, + { + "auxiliary_loss_clip": 0.01269164, + "auxiliary_loss_mlp": 0.01052243, + "balance_loss_clip": 1.07210231, + "balance_loss_mlp": 1.03818274, + "epoch": 0.07827812180604822, + "flos": 22711129436160.0, + "grad_norm": 1.9965005916549772, + "language_loss": 0.86122298, + "learning_rate": 3.975656161973742e-06, + "loss": 0.88443708, + "num_input_tokens_seen": 13820615, + "step": 651, + "time_per_iteration": 2.4871466159820557 + }, + { + "auxiliary_loss_clip": 0.01266259, + "auxiliary_loss_mlp": 0.01048124, + "balance_loss_clip": 1.0703311, + "balance_loss_mlp": 1.03427768, + "epoch": 0.0783983646966873, + "flos": 21725597001600.0, + "grad_norm": 2.5839022706032084, + "language_loss": 0.89153659, + "learning_rate": 3.9755348433850194e-06, + "loss": 0.91468048, + "num_input_tokens_seen": 13835955, + "step": 652, + "time_per_iteration": 2.4620301723480225 + }, + { + "auxiliary_loss_clip": 0.01129248, + "auxiliary_loss_mlp": 0.0102131, + "balance_loss_clip": 1.03415215, + "balance_loss_mlp": 1.01501548, + "epoch": 0.0785186075873264, + "flos": 60640877537280.0, + "grad_norm": 0.9887619338597119, + "language_loss": 0.63607579, + "learning_rate": 3.975413225109232e-06, + "loss": 0.65758133, + "num_input_tokens_seen": 13896505, + "step": 653, + "time_per_iteration": 3.128957748413086 + }, + { + "auxiliary_loss_clip": 0.01247449, + "auxiliary_loss_mlp": 0.01043969, + "balance_loss_clip": 1.06966519, + "balance_loss_mlp": 1.03072464, + "epoch": 0.0786388504779655, + "flos": 23877902920320.0, + "grad_norm": 3.090177528024159, + "language_loss": 0.93495977, + "learning_rate": 3.975291307164829e-06, + "loss": 0.95787394, + "num_input_tokens_seen": 13915150, + "step": 654, + "time_per_iteration": 2.518588066101074 + }, + { + "auxiliary_loss_clip": 0.01203009, + "auxiliary_loss_mlp": 0.01042327, + "balance_loss_clip": 1.06245482, + "balance_loss_mlp": 1.03022122, + "epoch": 0.07875909336860458, + "flos": 15158684822400.0, + "grad_norm": 2.3712801821022276, + "language_loss": 0.85193503, + "learning_rate": 3.975169089570306e-06, + "loss": 0.8743884, + "num_input_tokens_seen": 13933525, + "step": 655, + "time_per_iteration": 2.5760207176208496 + }, + { + "auxiliary_loss_clip": 0.01231263, + "auxiliary_loss_mlp": 0.01044512, + "balance_loss_clip": 1.06589484, + "balance_loss_mlp": 1.03122044, + "epoch": 0.07887933625924368, + "flos": 22236857233920.0, + "grad_norm": 2.017975083997081, + "language_loss": 0.91828609, + "learning_rate": 3.975046572344202e-06, + "loss": 0.94104385, + "num_input_tokens_seen": 13949985, + "step": 656, + "time_per_iteration": 2.5114076137542725 + }, + { + "auxiliary_loss_clip": 0.01208942, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_clip": 1.06128097, + "balance_loss_mlp": 1.0372932, + "epoch": 0.07899957914988276, + "flos": 20777734955520.0, + "grad_norm": 1.9462314733049724, + "language_loss": 0.71072853, + "learning_rate": 3.974923755505103e-06, + "loss": 0.73332226, + "num_input_tokens_seen": 13969215, + "step": 657, + "time_per_iteration": 2.5772500038146973 + }, + { + "auxiliary_loss_clip": 0.01204601, + "auxiliary_loss_mlp": 0.01043775, + "balance_loss_clip": 1.06278753, + "balance_loss_mlp": 1.03082943, + "epoch": 0.07911982204052186, + "flos": 23003047267200.0, + "grad_norm": 1.7194952265079781, + "language_loss": 0.91309035, + "learning_rate": 3.974800639071641e-06, + "loss": 0.93557405, + "num_input_tokens_seen": 13989935, + "step": 658, + "time_per_iteration": 2.632525682449341 + }, + { + "auxiliary_loss_clip": 0.01170711, + "auxiliary_loss_mlp": 0.0076629, + "balance_loss_clip": 1.05856371, + "balance_loss_mlp": 1.00033998, + "epoch": 0.07924006493116094, + "flos": 23111389664640.0, + "grad_norm": 2.136139036446059, + "language_loss": 1.00600171, + "learning_rate": 3.974677223062492e-06, + "loss": 1.02537167, + "num_input_tokens_seen": 14007150, + "step": 659, + "time_per_iteration": 2.723534107208252 + }, + { + "auxiliary_loss_clip": 0.0122892, + "auxiliary_loss_mlp": 0.01038667, + "balance_loss_clip": 1.06812882, + "balance_loss_mlp": 1.02604234, + "epoch": 0.07936030782180004, + "flos": 16472153450880.0, + "grad_norm": 1.9440289711396954, + "language_loss": 0.74405819, + "learning_rate": 3.974553507496378e-06, + "loss": 0.76673406, + "num_input_tokens_seen": 14025725, + "step": 660, + "time_per_iteration": 3.3443922996520996 + }, + { + "auxiliary_loss_clip": 0.01222159, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.0670166, + "balance_loss_mlp": 1.02789712, + "epoch": 0.07948055071243913, + "flos": 23733290764800.0, + "grad_norm": 2.3643717075132424, + "language_loss": 0.88975132, + "learning_rate": 3.974429492392068e-06, + "loss": 0.91240096, + "num_input_tokens_seen": 14045750, + "step": 661, + "time_per_iteration": 2.5975606441497803 + }, + { + "auxiliary_loss_clip": 0.01260346, + "auxiliary_loss_mlp": 0.00766076, + "balance_loss_clip": 1.07208467, + "balance_loss_mlp": 1.0003016, + "epoch": 0.07960079360307822, + "flos": 19573326996480.0, + "grad_norm": 2.0034913793044646, + "language_loss": 0.91186309, + "learning_rate": 3.974305177768373e-06, + "loss": 0.93212724, + "num_input_tokens_seen": 14063960, + "step": 662, + "time_per_iteration": 2.5311028957366943 + }, + { + "auxiliary_loss_clip": 0.0120549, + "auxiliary_loss_mlp": 0.01049673, + "balance_loss_clip": 1.06485152, + "balance_loss_mlp": 1.03619099, + "epoch": 0.07972103649371731, + "flos": 23513409659520.0, + "grad_norm": 3.2858681107034564, + "language_loss": 0.86359072, + "learning_rate": 3.974180563644152e-06, + "loss": 0.88614237, + "num_input_tokens_seen": 14082525, + "step": 663, + "time_per_iteration": 2.5958173274993896 + }, + { + "auxiliary_loss_clip": 0.01234112, + "auxiliary_loss_mlp": 0.01045875, + "balance_loss_clip": 1.06768823, + "balance_loss_mlp": 1.03302395, + "epoch": 0.0798412793843564, + "flos": 16726867770240.0, + "grad_norm": 2.529548067039062, + "language_loss": 0.89450055, + "learning_rate": 3.97405565003831e-06, + "loss": 0.91730046, + "num_input_tokens_seen": 14098610, + "step": 664, + "time_per_iteration": 3.256565570831299 + }, + { + "auxiliary_loss_clip": 0.01213461, + "auxiliary_loss_mlp": 0.01037467, + "balance_loss_clip": 1.06377316, + "balance_loss_mlp": 1.02453303, + "epoch": 0.07996152227499549, + "flos": 18223337214720.0, + "grad_norm": 1.9961695254687806, + "language_loss": 0.78604126, + "learning_rate": 3.973930436969794e-06, + "loss": 0.80855048, + "num_input_tokens_seen": 14117065, + "step": 665, + "time_per_iteration": 2.5939548015594482 + }, + { + "auxiliary_loss_clip": 0.01220187, + "auxiliary_loss_mlp": 0.01050269, + "balance_loss_clip": 1.06355524, + "balance_loss_mlp": 1.03709602, + "epoch": 0.08008176516563459, + "flos": 20594877793920.0, + "grad_norm": 1.9619162619694925, + "language_loss": 0.85930985, + "learning_rate": 3.973804924457602e-06, + "loss": 0.88201445, + "num_input_tokens_seen": 14135145, + "step": 666, + "time_per_iteration": 3.353271484375 + }, + { + "auxiliary_loss_clip": 0.01221835, + "auxiliary_loss_mlp": 0.01058227, + "balance_loss_clip": 1.06583881, + "balance_loss_mlp": 1.04523349, + "epoch": 0.08020200805627367, + "flos": 31834306863360.0, + "grad_norm": 1.9799605433536214, + "language_loss": 0.85940409, + "learning_rate": 3.973679112520771e-06, + "loss": 0.88220471, + "num_input_tokens_seen": 14156860, + "step": 667, + "time_per_iteration": 2.6472995281219482 + }, + { + "auxiliary_loss_clip": 0.01203249, + "auxiliary_loss_mlp": 0.01038871, + "balance_loss_clip": 1.06058013, + "balance_loss_mlp": 1.02604365, + "epoch": 0.08032225094691277, + "flos": 17783503176960.0, + "grad_norm": 1.9834034320635612, + "language_loss": 0.99088603, + "learning_rate": 3.973553001178389e-06, + "loss": 1.01330733, + "num_input_tokens_seen": 14174365, + "step": 668, + "time_per_iteration": 3.356459379196167 + }, + { + "auxiliary_loss_clip": 0.01215875, + "auxiliary_loss_mlp": 0.01037173, + "balance_loss_clip": 1.06615376, + "balance_loss_mlp": 1.02455425, + "epoch": 0.08044249383755185, + "flos": 24061693835520.0, + "grad_norm": 2.304705174710886, + "language_loss": 0.75877726, + "learning_rate": 3.973426590449585e-06, + "loss": 0.7813077, + "num_input_tokens_seen": 14192320, + "step": 669, + "time_per_iteration": 2.5939626693725586 + }, + { + "auxiliary_loss_clip": 0.01198574, + "auxiliary_loss_mlp": 0.01039915, + "balance_loss_clip": 1.06314945, + "balance_loss_mlp": 1.02717161, + "epoch": 0.08056273672819095, + "flos": 18223624523520.0, + "grad_norm": 1.8725881784779344, + "language_loss": 0.75441134, + "learning_rate": 3.9732998803535364e-06, + "loss": 0.77679622, + "num_input_tokens_seen": 14210380, + "step": 670, + "time_per_iteration": 2.5770716667175293 + }, + { + "auxiliary_loss_clip": 0.01262434, + "auxiliary_loss_mlp": 0.01044146, + "balance_loss_clip": 1.0697782, + "balance_loss_mlp": 1.03130746, + "epoch": 0.08068297961883003, + "flos": 19676856971520.0, + "grad_norm": 2.540563615216749, + "language_loss": 0.85565472, + "learning_rate": 3.973172870909465e-06, + "loss": 0.87872052, + "num_input_tokens_seen": 14225145, + "step": 671, + "time_per_iteration": 2.4702680110931396 + }, + { + "auxiliary_loss_clip": 0.01235763, + "auxiliary_loss_mlp": 0.01040664, + "balance_loss_clip": 1.06584334, + "balance_loss_mlp": 1.02686596, + "epoch": 0.08080322250946913, + "flos": 23148736830720.0, + "grad_norm": 2.9818086536384993, + "language_loss": 0.80207551, + "learning_rate": 3.973045562136638e-06, + "loss": 0.82483977, + "num_input_tokens_seen": 14241960, + "step": 672, + "time_per_iteration": 2.5467593669891357 + }, + { + "auxiliary_loss_clip": 0.01249875, + "auxiliary_loss_mlp": 0.01041783, + "balance_loss_clip": 1.06946349, + "balance_loss_mlp": 1.02912903, + "epoch": 0.08092346540010822, + "flos": 21763626526080.0, + "grad_norm": 2.2014684530543898, + "language_loss": 0.91550684, + "learning_rate": 3.972917954054368e-06, + "loss": 0.93842345, + "num_input_tokens_seen": 14260515, + "step": 673, + "time_per_iteration": 2.5179567337036133 + }, + { + "auxiliary_loss_clip": 0.01229938, + "auxiliary_loss_mlp": 0.01047212, + "balance_loss_clip": 1.07049966, + "balance_loss_mlp": 1.03244793, + "epoch": 0.08104370829074731, + "flos": 21032485188480.0, + "grad_norm": 2.1421099713565432, + "language_loss": 0.81740284, + "learning_rate": 3.972790046682013e-06, + "loss": 0.84017432, + "num_input_tokens_seen": 14279190, + "step": 674, + "time_per_iteration": 2.5228404998779297 + }, + { + "auxiliary_loss_clip": 0.01214516, + "auxiliary_loss_mlp": 0.01041553, + "balance_loss_clip": 1.06217706, + "balance_loss_mlp": 1.02849913, + "epoch": 0.0811639511813864, + "flos": 20083186598400.0, + "grad_norm": 1.803704715733776, + "language_loss": 0.78986847, + "learning_rate": 3.972661840038977e-06, + "loss": 0.81242919, + "num_input_tokens_seen": 14299480, + "step": 675, + "time_per_iteration": 2.58609676361084 + }, + { + "auxiliary_loss_clip": 0.01248848, + "auxiliary_loss_mlp": 0.01040152, + "balance_loss_clip": 1.07150388, + "balance_loss_mlp": 1.02758765, + "epoch": 0.08128419407202549, + "flos": 16836718538880.0, + "grad_norm": 2.1729115301359716, + "language_loss": 0.83291638, + "learning_rate": 3.972533334144707e-06, + "loss": 0.85580635, + "num_input_tokens_seen": 14316405, + "step": 676, + "time_per_iteration": 2.4742016792297363 + }, + { + "auxiliary_loss_clip": 0.01250297, + "auxiliary_loss_mlp": 0.01043874, + "balance_loss_clip": 1.06717348, + "balance_loss_mlp": 1.03067183, + "epoch": 0.08140443696266458, + "flos": 23769273214080.0, + "grad_norm": 2.422506621434802, + "language_loss": 0.783234, + "learning_rate": 3.972404529018699e-06, + "loss": 0.80617571, + "num_input_tokens_seen": 14336265, + "step": 677, + "time_per_iteration": 2.538540840148926 + }, + { + "auxiliary_loss_clip": 0.01223953, + "auxiliary_loss_mlp": 0.01035828, + "balance_loss_clip": 1.06134868, + "balance_loss_mlp": 1.02343643, + "epoch": 0.08152467985330367, + "flos": 24390132819840.0, + "grad_norm": 1.984735276641255, + "language_loss": 0.85500354, + "learning_rate": 3.972275424680493e-06, + "loss": 0.87760133, + "num_input_tokens_seen": 14356375, + "step": 678, + "time_per_iteration": 2.5549771785736084 + }, + { + "auxiliary_loss_clip": 0.01259167, + "auxiliary_loss_mlp": 0.01037917, + "balance_loss_clip": 1.06892157, + "balance_loss_mlp": 1.0253942, + "epoch": 0.08164492274394276, + "flos": 19317750750720.0, + "grad_norm": 2.101815434680312, + "language_loss": 0.91848153, + "learning_rate": 3.972146021149673e-06, + "loss": 0.94145238, + "num_input_tokens_seen": 14374650, + "step": 679, + "time_per_iteration": 2.4407427310943604 + }, + { + "auxiliary_loss_clip": 0.01213828, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_clip": 1.06540799, + "balance_loss_mlp": 1.02911127, + "epoch": 0.08176516563458186, + "flos": 14830461319680.0, + "grad_norm": 3.178083159867089, + "language_loss": 0.7836771, + "learning_rate": 3.972016318445868e-06, + "loss": 0.80622035, + "num_input_tokens_seen": 14392650, + "step": 680, + "time_per_iteration": 2.5333454608917236 + }, + { + "auxiliary_loss_clip": 0.01243865, + "auxiliary_loss_mlp": 0.01046605, + "balance_loss_clip": 1.0676018, + "balance_loss_mlp": 1.03423095, + "epoch": 0.08188540852522094, + "flos": 22602320161920.0, + "grad_norm": 2.239080391683848, + "language_loss": 0.92591882, + "learning_rate": 3.971886316588757e-06, + "loss": 0.94882351, + "num_input_tokens_seen": 14413155, + "step": 681, + "time_per_iteration": 2.5087900161743164 + }, + { + "auxiliary_loss_clip": 0.01204447, + "auxiliary_loss_mlp": 0.01049683, + "balance_loss_clip": 1.06499052, + "balance_loss_mlp": 1.03581882, + "epoch": 0.08200565141586004, + "flos": 19463727623040.0, + "grad_norm": 2.4840795439774612, + "language_loss": 0.73761892, + "learning_rate": 3.9717560155980595e-06, + "loss": 0.76016021, + "num_input_tokens_seen": 14428805, + "step": 682, + "time_per_iteration": 2.557297706604004 + }, + { + "auxiliary_loss_clip": 0.01245373, + "auxiliary_loss_mlp": 0.01045271, + "balance_loss_clip": 1.06866574, + "balance_loss_mlp": 1.03249168, + "epoch": 0.08212589430649912, + "flos": 20594662312320.0, + "grad_norm": 2.6635650868617953, + "language_loss": 0.92139184, + "learning_rate": 3.971625415493542e-06, + "loss": 0.94429833, + "num_input_tokens_seen": 14447125, + "step": 683, + "time_per_iteration": 2.495816469192505 + }, + { + "auxiliary_loss_clip": 0.0120769, + "auxiliary_loss_mlp": 0.01042311, + "balance_loss_clip": 1.06395745, + "balance_loss_mlp": 1.02907932, + "epoch": 0.08224613719713822, + "flos": 25953611086080.0, + "grad_norm": 1.8735624448812276, + "language_loss": 0.87464941, + "learning_rate": 3.971494516295017e-06, + "loss": 0.89714932, + "num_input_tokens_seen": 14466575, + "step": 684, + "time_per_iteration": 2.5975239276885986 + }, + { + "auxiliary_loss_clip": 0.01216489, + "auxiliary_loss_mlp": 0.0104531, + "balance_loss_clip": 1.06358957, + "balance_loss_mlp": 1.03220904, + "epoch": 0.08236638008777732, + "flos": 23768734510080.0, + "grad_norm": 1.977049678289977, + "language_loss": 0.85380328, + "learning_rate": 3.971363318022341e-06, + "loss": 0.87642121, + "num_input_tokens_seen": 14487915, + "step": 685, + "time_per_iteration": 2.5909695625305176 + }, + { + "auxiliary_loss_clip": 0.0123084, + "auxiliary_loss_mlp": 0.01047554, + "balance_loss_clip": 1.06277061, + "balance_loss_mlp": 1.03398764, + "epoch": 0.0824866229784164, + "flos": 38799144887040.0, + "grad_norm": 2.320995925729637, + "language_loss": 0.68715346, + "learning_rate": 3.971231820695417e-06, + "loss": 0.70993733, + "num_input_tokens_seen": 14511530, + "step": 686, + "time_per_iteration": 2.6729483604431152 + }, + { + "auxiliary_loss_clip": 0.01236806, + "auxiliary_loss_mlp": 0.01045352, + "balance_loss_clip": 1.06897497, + "balance_loss_mlp": 1.03226888, + "epoch": 0.0826068658690555, + "flos": 23107762391040.0, + "grad_norm": 1.7689594334367413, + "language_loss": 0.81249619, + "learning_rate": 3.971100024334193e-06, + "loss": 0.83531773, + "num_input_tokens_seen": 14529050, + "step": 687, + "time_per_iteration": 3.4973368644714355 + }, + { + "auxiliary_loss_clip": 0.01195503, + "auxiliary_loss_mlp": 0.01045408, + "balance_loss_clip": 1.05743408, + "balance_loss_mlp": 1.03322434, + "epoch": 0.08272710875969458, + "flos": 21136374299520.0, + "grad_norm": 2.556138440075114, + "language_loss": 0.86235952, + "learning_rate": 3.970967928958663e-06, + "loss": 0.88476861, + "num_input_tokens_seen": 14546165, + "step": 688, + "time_per_iteration": 2.646243095397949 + }, + { + "auxiliary_loss_clip": 0.01202286, + "auxiliary_loss_mlp": 0.01048323, + "balance_loss_clip": 1.06436276, + "balance_loss_mlp": 1.03584182, + "epoch": 0.08284735165033368, + "flos": 19063000517760.0, + "grad_norm": 1.6641429196113793, + "language_loss": 0.83409357, + "learning_rate": 3.970835534588865e-06, + "loss": 0.85659969, + "num_input_tokens_seen": 14563660, + "step": 689, + "time_per_iteration": 2.590881109237671 + }, + { + "auxiliary_loss_clip": 0.01231876, + "auxiliary_loss_mlp": 0.0104943, + "balance_loss_clip": 1.06994677, + "balance_loss_mlp": 1.03721166, + "epoch": 0.08296759454097276, + "flos": 16727442387840.0, + "grad_norm": 1.9299228935292398, + "language_loss": 0.85633647, + "learning_rate": 3.970702841244883e-06, + "loss": 0.87914944, + "num_input_tokens_seen": 14581980, + "step": 690, + "time_per_iteration": 2.5300991535186768 + }, + { + "auxiliary_loss_clip": 0.01249433, + "auxiliary_loss_mlp": 0.01044814, + "balance_loss_clip": 1.07060182, + "balance_loss_mlp": 1.03250003, + "epoch": 0.08308783743161186, + "flos": 18004928567040.0, + "grad_norm": 2.3234322801547704, + "language_loss": 0.82538581, + "learning_rate": 3.970569848946847e-06, + "loss": 0.84832823, + "num_input_tokens_seen": 14601795, + "step": 691, + "time_per_iteration": 3.330634355545044 + }, + { + "auxiliary_loss_clip": 0.01230133, + "auxiliary_loss_mlp": 0.01039325, + "balance_loss_clip": 1.06582785, + "balance_loss_mlp": 1.02703404, + "epoch": 0.08320808032225095, + "flos": 15079788599040.0, + "grad_norm": 2.413250600106508, + "language_loss": 0.82537591, + "learning_rate": 3.970436557714932e-06, + "loss": 0.8480705, + "num_input_tokens_seen": 14618315, + "step": 692, + "time_per_iteration": 2.54731822013855 + }, + { + "auxiliary_loss_clip": 0.01222118, + "auxiliary_loss_mlp": 0.01039079, + "balance_loss_clip": 1.06346726, + "balance_loss_mlp": 1.02607346, + "epoch": 0.08332832321289003, + "flos": 22383085501440.0, + "grad_norm": 8.100572751541788, + "language_loss": 0.86280978, + "learning_rate": 3.970302967569358e-06, + "loss": 0.88542175, + "num_input_tokens_seen": 14636905, + "step": 693, + "time_per_iteration": 3.327517509460449 + }, + { + "auxiliary_loss_clip": 0.01247305, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.07215595, + "balance_loss_mlp": 1.0325501, + "epoch": 0.08344856610352913, + "flos": 24717386655360.0, + "grad_norm": 1.901300033673213, + "language_loss": 0.68107098, + "learning_rate": 3.9701690785303896e-06, + "loss": 0.70399761, + "num_input_tokens_seen": 14656100, + "step": 694, + "time_per_iteration": 2.547891139984131 + }, + { + "auxiliary_loss_clip": 0.01250563, + "auxiliary_loss_mlp": 0.01042239, + "balance_loss_clip": 1.06959915, + "balance_loss_mlp": 1.02993703, + "epoch": 0.08356880899416821, + "flos": 25370206387200.0, + "grad_norm": 2.2751908071743117, + "language_loss": 0.88450772, + "learning_rate": 3.970034890618339e-06, + "loss": 0.90743572, + "num_input_tokens_seen": 14675790, + "step": 695, + "time_per_iteration": 3.253553867340088 + }, + { + "auxiliary_loss_clip": 0.01229564, + "auxiliary_loss_mlp": 0.01037178, + "balance_loss_clip": 1.06536865, + "balance_loss_mlp": 1.02533412, + "epoch": 0.08368905188480731, + "flos": 24353072962560.0, + "grad_norm": 1.9940795528678446, + "language_loss": 0.88021863, + "learning_rate": 3.969900403853562e-06, + "loss": 0.90288603, + "num_input_tokens_seen": 14694830, + "step": 696, + "time_per_iteration": 2.5634326934814453 + }, + { + "auxiliary_loss_clip": 0.01265723, + "auxiliary_loss_mlp": 0.01052848, + "balance_loss_clip": 1.07358658, + "balance_loss_mlp": 1.03993201, + "epoch": 0.08380929477544641, + "flos": 18037319656320.0, + "grad_norm": 1.5595384805472767, + "language_loss": 0.77765125, + "learning_rate": 3.96976561825646e-06, + "loss": 0.80083692, + "num_input_tokens_seen": 14711920, + "step": 697, + "time_per_iteration": 2.472980260848999 + }, + { + "auxiliary_loss_clip": 0.01198585, + "auxiliary_loss_mlp": 0.01038486, + "balance_loss_clip": 1.06354189, + "balance_loss_mlp": 1.0268271, + "epoch": 0.08392953766608549, + "flos": 26286287875200.0, + "grad_norm": 2.1700018346639642, + "language_loss": 0.87154067, + "learning_rate": 3.969630533847479e-06, + "loss": 0.89391136, + "num_input_tokens_seen": 14730880, + "step": 698, + "time_per_iteration": 2.649151563644409 + }, + { + "auxiliary_loss_clip": 0.01246313, + "auxiliary_loss_mlp": 0.01039335, + "balance_loss_clip": 1.06890726, + "balance_loss_mlp": 1.02702081, + "epoch": 0.08404978055672459, + "flos": 22492146170880.0, + "grad_norm": 2.1303424371162727, + "language_loss": 0.84012669, + "learning_rate": 3.969495150647113e-06, + "loss": 0.86298317, + "num_input_tokens_seen": 14749050, + "step": 699, + "time_per_iteration": 2.5055344104766846 + }, + { + "auxiliary_loss_clip": 0.01210867, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.06637263, + "balance_loss_mlp": 1.02541149, + "epoch": 0.08417002344736367, + "flos": 24826878288000.0, + "grad_norm": 1.7501081576618442, + "language_loss": 0.76538599, + "learning_rate": 3.969359468675899e-06, + "loss": 0.78786945, + "num_input_tokens_seen": 14769180, + "step": 700, + "time_per_iteration": 2.6334521770477295 + }, + { + "auxiliary_loss_clip": 0.01241755, + "auxiliary_loss_mlp": 0.01038554, + "balance_loss_clip": 1.06796682, + "balance_loss_mlp": 1.02598906, + "epoch": 0.08429026633800277, + "flos": 16945922862720.0, + "grad_norm": 2.099904727483784, + "language_loss": 0.89646226, + "learning_rate": 3.969223487954418e-06, + "loss": 0.91926539, + "num_input_tokens_seen": 14786640, + "step": 701, + "time_per_iteration": 2.493617057800293 + }, + { + "auxiliary_loss_clip": 0.01199374, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.06746805, + "balance_loss_mlp": 1.03342295, + "epoch": 0.08441050922864185, + "flos": 23841920471040.0, + "grad_norm": 2.2777933330410383, + "language_loss": 0.82761592, + "learning_rate": 3.969087208503301e-06, + "loss": 0.85006618, + "num_input_tokens_seen": 14806720, + "step": 702, + "time_per_iteration": 2.6200878620147705 + }, + { + "auxiliary_loss_clip": 0.01201294, + "auxiliary_loss_mlp": 0.01043453, + "balance_loss_clip": 1.06502533, + "balance_loss_mlp": 1.03105485, + "epoch": 0.08453075211928095, + "flos": 25520205582720.0, + "grad_norm": 2.426950836105903, + "language_loss": 0.84487534, + "learning_rate": 3.968950630343219e-06, + "loss": 0.8673228, + "num_input_tokens_seen": 14823705, + "step": 703, + "time_per_iteration": 2.6058740615844727 + }, + { + "auxiliary_loss_clip": 0.01226597, + "auxiliary_loss_mlp": 0.01044622, + "balance_loss_clip": 1.06400526, + "balance_loss_mlp": 1.03252172, + "epoch": 0.08465099500992004, + "flos": 19532496211200.0, + "grad_norm": 1.9550391882105205, + "language_loss": 0.93532425, + "learning_rate": 3.968813753494892e-06, + "loss": 0.95803642, + "num_input_tokens_seen": 14841865, + "step": 704, + "time_per_iteration": 2.548114538192749 + }, + { + "auxiliary_loss_clip": 0.01200686, + "auxiliary_loss_mlp": 0.00766973, + "balance_loss_clip": 1.05830419, + "balance_loss_mlp": 1.00026751, + "epoch": 0.08477123790055913, + "flos": 29351299403520.0, + "grad_norm": 2.5948731469874735, + "language_loss": 0.75383914, + "learning_rate": 3.968676577979084e-06, + "loss": 0.77351582, + "num_input_tokens_seen": 14861415, + "step": 705, + "time_per_iteration": 2.655470609664917 + }, + { + "auxiliary_loss_clip": 0.0118913, + "auxiliary_loss_mlp": 0.01050092, + "balance_loss_clip": 1.05848932, + "balance_loss_mlp": 1.03756881, + "epoch": 0.08489148079119822, + "flos": 18624495283200.0, + "grad_norm": 13.204197203013443, + "language_loss": 0.78260064, + "learning_rate": 3.968539103816605e-06, + "loss": 0.80499291, + "num_input_tokens_seen": 14879215, + "step": 706, + "time_per_iteration": 2.5678460597991943 + }, + { + "auxiliary_loss_clip": 0.01228512, + "auxiliary_loss_mlp": 0.00766381, + "balance_loss_clip": 1.06937885, + "balance_loss_mlp": 1.00028443, + "epoch": 0.0850117236818373, + "flos": 23471393725440.0, + "grad_norm": 1.8222376117592827, + "language_loss": 0.89680481, + "learning_rate": 3.9684013310283085e-06, + "loss": 0.91675377, + "num_input_tokens_seen": 14897900, + "step": 707, + "time_per_iteration": 2.6264874935150146 + }, + { + "auxiliary_loss_clip": 0.01224558, + "auxiliary_loss_mlp": 0.01050407, + "balance_loss_clip": 1.06930017, + "balance_loss_mlp": 1.03790176, + "epoch": 0.0851319665724764, + "flos": 40625058896640.0, + "grad_norm": 1.8237705906278956, + "language_loss": 0.63923478, + "learning_rate": 3.9682632596350956e-06, + "loss": 0.66198444, + "num_input_tokens_seen": 14919065, + "step": 708, + "time_per_iteration": 2.7082176208496094 + }, + { + "auxiliary_loss_clip": 0.01240995, + "auxiliary_loss_mlp": 0.01039717, + "balance_loss_clip": 1.06945312, + "balance_loss_mlp": 1.02738512, + "epoch": 0.0852522094631155, + "flos": 15879554870400.0, + "grad_norm": 2.556451634365381, + "language_loss": 0.78258598, + "learning_rate": 3.968124889657911e-06, + "loss": 0.80539304, + "num_input_tokens_seen": 14934165, + "step": 709, + "time_per_iteration": 2.4888973236083984 + }, + { + "auxiliary_loss_clip": 0.01193257, + "auxiliary_loss_mlp": 0.0104827, + "balance_loss_clip": 1.06094432, + "balance_loss_mlp": 1.03685558, + "epoch": 0.08537245235375458, + "flos": 14567091822720.0, + "grad_norm": 2.1951517339270437, + "language_loss": 0.90588379, + "learning_rate": 3.967986221117746e-06, + "loss": 0.92829907, + "num_input_tokens_seen": 14950105, + "step": 710, + "time_per_iteration": 2.567375421524048 + }, + { + "auxiliary_loss_clip": 0.01169918, + "auxiliary_loss_mlp": 0.01038101, + "balance_loss_clip": 1.05987644, + "balance_loss_mlp": 1.02613211, + "epoch": 0.08549269524439368, + "flos": 26468929555200.0, + "grad_norm": 1.762491144624555, + "language_loss": 0.86440134, + "learning_rate": 3.967847254035635e-06, + "loss": 0.88648152, + "num_input_tokens_seen": 14969490, + "step": 711, + "time_per_iteration": 2.763526201248169 + }, + { + "auxiliary_loss_clip": 0.01210719, + "auxiliary_loss_mlp": 0.01041439, + "balance_loss_clip": 1.0624547, + "balance_loss_mlp": 1.02914262, + "epoch": 0.08561293813503276, + "flos": 13590214565760.0, + "grad_norm": 2.1984781828452333, + "language_loss": 0.86634451, + "learning_rate": 3.967707988432661e-06, + "loss": 0.88886607, + "num_input_tokens_seen": 14987195, + "step": 712, + "time_per_iteration": 2.5990025997161865 + }, + { + "auxiliary_loss_clip": 0.01261284, + "auxiliary_loss_mlp": 0.01046279, + "balance_loss_clip": 1.06909609, + "balance_loss_mlp": 1.03321981, + "epoch": 0.08573318102567186, + "flos": 26943524979840.0, + "grad_norm": 2.3147700761463725, + "language_loss": 0.87544751, + "learning_rate": 3.967568424329949e-06, + "loss": 0.89852315, + "num_input_tokens_seen": 15007620, + "step": 713, + "time_per_iteration": 3.3077073097229004 + }, + { + "auxiliary_loss_clip": 0.01133771, + "auxiliary_loss_mlp": 0.01011697, + "balance_loss_clip": 1.03357041, + "balance_loss_mlp": 1.00630915, + "epoch": 0.08585342391631094, + "flos": 67302739319040.0, + "grad_norm": 0.8213084999241819, + "language_loss": 0.55536014, + "learning_rate": 3.967428561748671e-06, + "loss": 0.57681483, + "num_input_tokens_seen": 15075590, + "step": 714, + "time_per_iteration": 3.2335643768310547 + }, + { + "auxiliary_loss_clip": 0.01188326, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.05715561, + "balance_loss_mlp": 1.03285527, + "epoch": 0.08597366680695004, + "flos": 22456594684800.0, + "grad_norm": 1.7662692165734508, + "language_loss": 0.87422907, + "learning_rate": 3.967288400710045e-06, + "loss": 0.8965739, + "num_input_tokens_seen": 15095055, + "step": 715, + "time_per_iteration": 2.6219892501831055 + }, + { + "auxiliary_loss_clip": 0.0120543, + "auxiliary_loss_mlp": 0.01042058, + "balance_loss_clip": 1.06550384, + "balance_loss_mlp": 1.02917147, + "epoch": 0.08609390969758914, + "flos": 23550505430400.0, + "grad_norm": 6.4115412399782326, + "language_loss": 0.8852272, + "learning_rate": 3.9671479412353335e-06, + "loss": 0.90770209, + "num_input_tokens_seen": 15113520, + "step": 716, + "time_per_iteration": 2.5895464420318604 + }, + { + "auxiliary_loss_clip": 0.01245746, + "auxiliary_loss_mlp": 0.01046487, + "balance_loss_clip": 1.07027149, + "balance_loss_mlp": 1.03395176, + "epoch": 0.08621415258822822, + "flos": 25885848078720.0, + "grad_norm": 2.286610213923094, + "language_loss": 0.74080592, + "learning_rate": 3.967007183345843e-06, + "loss": 0.76372826, + "num_input_tokens_seen": 15133375, + "step": 717, + "time_per_iteration": 2.538978338241577 + }, + { + "auxiliary_loss_clip": 0.01238085, + "auxiliary_loss_mlp": 0.01041016, + "balance_loss_clip": 1.06754804, + "balance_loss_mlp": 1.0286417, + "epoch": 0.08633439547886732, + "flos": 13589568120960.0, + "grad_norm": 5.229286347001644, + "language_loss": 0.89719927, + "learning_rate": 3.966866127062927e-06, + "loss": 0.91999024, + "num_input_tokens_seen": 15150500, + "step": 718, + "time_per_iteration": 3.2775516510009766 + }, + { + "auxiliary_loss_clip": 0.01138275, + "auxiliary_loss_mlp": 0.01004857, + "balance_loss_clip": 1.03685832, + "balance_loss_mlp": 0.99963522, + "epoch": 0.0864546383695064, + "flos": 57767342434560.0, + "grad_norm": 0.8650890060221531, + "language_loss": 0.62711769, + "learning_rate": 3.966724772407982e-06, + "loss": 0.64854902, + "num_input_tokens_seen": 15208015, + "step": 719, + "time_per_iteration": 3.7964155673980713 + }, + { + "auxiliary_loss_clip": 0.01203392, + "auxiliary_loss_mlp": 0.01040317, + "balance_loss_clip": 1.06223512, + "balance_loss_mlp": 1.02830648, + "epoch": 0.0865748812601455, + "flos": 20046952753920.0, + "grad_norm": 2.6066224651585856, + "language_loss": 0.88814062, + "learning_rate": 3.966583119402454e-06, + "loss": 0.91057777, + "num_input_tokens_seen": 15224780, + "step": 720, + "time_per_iteration": 2.5559685230255127 + }, + { + "auxiliary_loss_clip": 0.0123805, + "auxiliary_loss_mlp": 0.00766036, + "balance_loss_clip": 1.06715417, + "balance_loss_mlp": 1.0003736, + "epoch": 0.08669512415078459, + "flos": 35262446935680.0, + "grad_norm": 1.8037432730052723, + "language_loss": 0.82261783, + "learning_rate": 3.9664411680678305e-06, + "loss": 0.8426587, + "num_input_tokens_seen": 15246535, + "step": 721, + "time_per_iteration": 3.4028894901275635 + }, + { + "auxiliary_loss_clip": 0.0111008, + "auxiliary_loss_mlp": 0.0100618, + "balance_loss_clip": 1.03080273, + "balance_loss_mlp": 1.00062466, + "epoch": 0.08681536704142367, + "flos": 65654870048640.0, + "grad_norm": 0.846456965627966, + "language_loss": 0.61413199, + "learning_rate": 3.966298918425644e-06, + "loss": 0.63529462, + "num_input_tokens_seen": 15304025, + "step": 722, + "time_per_iteration": 3.0168802738189697 + }, + { + "auxiliary_loss_clip": 0.01242945, + "auxiliary_loss_mlp": 0.01044157, + "balance_loss_clip": 1.06585038, + "balance_loss_mlp": 1.03106213, + "epoch": 0.08693560993206277, + "flos": 34529940881280.0, + "grad_norm": 1.7754324483920783, + "language_loss": 0.8268522, + "learning_rate": 3.966156370497476e-06, + "loss": 0.84972322, + "num_input_tokens_seen": 15327635, + "step": 723, + "time_per_iteration": 2.634556293487549 + }, + { + "auxiliary_loss_clip": 0.01244077, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.0660311, + "balance_loss_mlp": 1.02744007, + "epoch": 0.08705585282270185, + "flos": 23149419189120.0, + "grad_norm": 1.7772117672542782, + "language_loss": 0.88431096, + "learning_rate": 3.96601352430495e-06, + "loss": 0.90714991, + "num_input_tokens_seen": 15347405, + "step": 724, + "time_per_iteration": 2.532996654510498 + }, + { + "auxiliary_loss_clip": 0.01226274, + "auxiliary_loss_mlp": 0.01048428, + "balance_loss_clip": 1.06837964, + "balance_loss_mlp": 1.03544569, + "epoch": 0.08717609571334095, + "flos": 29497599498240.0, + "grad_norm": 1.7896550083133873, + "language_loss": 0.83266312, + "learning_rate": 3.965870379869735e-06, + "loss": 0.85541016, + "num_input_tokens_seen": 15369450, + "step": 725, + "time_per_iteration": 2.60739803314209 + }, + { + "auxiliary_loss_clip": 0.01238696, + "auxiliary_loss_mlp": 0.01042638, + "balance_loss_clip": 1.06270838, + "balance_loss_mlp": 1.03075862, + "epoch": 0.08729633860398003, + "flos": 20667489137280.0, + "grad_norm": 2.141342025028866, + "language_loss": 0.86694145, + "learning_rate": 3.965726937213547e-06, + "loss": 0.88975489, + "num_input_tokens_seen": 15388085, + "step": 726, + "time_per_iteration": 2.5231199264526367 + }, + { + "auxiliary_loss_clip": 0.01239395, + "auxiliary_loss_mlp": 0.01049424, + "balance_loss_clip": 1.06296325, + "balance_loss_mlp": 1.03657281, + "epoch": 0.08741658149461913, + "flos": 18369493655040.0, + "grad_norm": 2.0968770109349633, + "language_loss": 0.81280845, + "learning_rate": 3.965583196358144e-06, + "loss": 0.83569664, + "num_input_tokens_seen": 15407120, + "step": 727, + "time_per_iteration": 2.6597909927368164 + }, + { + "auxiliary_loss_clip": 0.01260028, + "auxiliary_loss_mlp": 0.0104298, + "balance_loss_clip": 1.06877446, + "balance_loss_mlp": 1.02906823, + "epoch": 0.08753682438525823, + "flos": 18729677283840.0, + "grad_norm": 2.689444276680461, + "language_loss": 0.74456441, + "learning_rate": 3.965439157325335e-06, + "loss": 0.76759458, + "num_input_tokens_seen": 15424485, + "step": 728, + "time_per_iteration": 2.538700819015503 + }, + { + "auxiliary_loss_clip": 0.01218934, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.05975735, + "balance_loss_mlp": 1.02298844, + "epoch": 0.08765706727589731, + "flos": 27776113303680.0, + "grad_norm": 1.9871436051786833, + "language_loss": 0.76060796, + "learning_rate": 3.965294820136968e-06, + "loss": 0.78316665, + "num_input_tokens_seen": 15446285, + "step": 729, + "time_per_iteration": 2.654282331466675 + }, + { + "auxiliary_loss_clip": 0.01229199, + "auxiliary_loss_mlp": 0.01038602, + "balance_loss_clip": 1.06583357, + "balance_loss_mlp": 1.0264132, + "epoch": 0.08777731016653641, + "flos": 24389127239040.0, + "grad_norm": 2.2427772571083304, + "language_loss": 0.87208581, + "learning_rate": 3.965150184814938e-06, + "loss": 0.89476383, + "num_input_tokens_seen": 15465770, + "step": 730, + "time_per_iteration": 2.6372649669647217 + }, + { + "auxiliary_loss_clip": 0.01215954, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_clip": 1.06418157, + "balance_loss_mlp": 1.03205848, + "epoch": 0.08789755305717549, + "flos": 21981855605760.0, + "grad_norm": 3.629207783504735, + "language_loss": 0.76280951, + "learning_rate": 3.965005251381189e-06, + "loss": 0.78542101, + "num_input_tokens_seen": 15483705, + "step": 731, + "time_per_iteration": 2.636958360671997 + }, + { + "auxiliary_loss_clip": 0.01133541, + "auxiliary_loss_mlp": 0.01005898, + "balance_loss_clip": 1.03007054, + "balance_loss_mlp": 1.00070095, + "epoch": 0.08801779594781459, + "flos": 58360120583040.0, + "grad_norm": 0.9085074219945398, + "language_loss": 0.64629811, + "learning_rate": 3.964860019857705e-06, + "loss": 0.66769254, + "num_input_tokens_seen": 15548620, + "step": 732, + "time_per_iteration": 3.1310997009277344 + }, + { + "auxiliary_loss_clip": 0.01260248, + "auxiliary_loss_mlp": 0.01045512, + "balance_loss_clip": 1.07243061, + "balance_loss_mlp": 1.03412759, + "epoch": 0.08813803883845367, + "flos": 23294785530240.0, + "grad_norm": 1.8580667711307786, + "language_loss": 0.84247398, + "learning_rate": 3.964714490266518e-06, + "loss": 0.86553162, + "num_input_tokens_seen": 15569265, + "step": 733, + "time_per_iteration": 2.5205700397491455 + }, + { + "auxiliary_loss_clip": 0.0112836, + "auxiliary_loss_mlp": 0.01007157, + "balance_loss_clip": 1.02875125, + "balance_loss_mlp": 1.00191176, + "epoch": 0.08825828172909277, + "flos": 63424924882560.0, + "grad_norm": 0.8963673826631723, + "language_loss": 0.6459381, + "learning_rate": 3.964568662629706e-06, + "loss": 0.66729331, + "num_input_tokens_seen": 15630570, + "step": 734, + "time_per_iteration": 3.007103204727173 + }, + { + "auxiliary_loss_clip": 0.01234452, + "auxiliary_loss_mlp": 0.01042552, + "balance_loss_clip": 1.06225801, + "balance_loss_mlp": 1.03089297, + "epoch": 0.08837852461973186, + "flos": 26720986268160.0, + "grad_norm": 3.237547215292283, + "language_loss": 0.84352016, + "learning_rate": 3.9644225369693895e-06, + "loss": 0.86629015, + "num_input_tokens_seen": 15650870, + "step": 735, + "time_per_iteration": 2.5715677738189697 + }, + { + "auxiliary_loss_clip": 0.01255842, + "auxiliary_loss_mlp": 0.01039215, + "balance_loss_clip": 1.06903756, + "balance_loss_mlp": 1.02711511, + "epoch": 0.08849876751037095, + "flos": 27265427688960.0, + "grad_norm": 1.9027809355522933, + "language_loss": 0.86782962, + "learning_rate": 3.964276113307735e-06, + "loss": 0.89078015, + "num_input_tokens_seen": 15670835, + "step": 736, + "time_per_iteration": 2.5143232345581055 + }, + { + "auxiliary_loss_clip": 0.012099, + "auxiliary_loss_mlp": 0.01050886, + "balance_loss_clip": 1.06421185, + "balance_loss_mlp": 1.0377847, + "epoch": 0.08861901040101004, + "flos": 19828759587840.0, + "grad_norm": 1.8659619759796535, + "language_loss": 0.80636764, + "learning_rate": 3.9641293916669574e-06, + "loss": 0.8289755, + "num_input_tokens_seen": 15689795, + "step": 737, + "time_per_iteration": 2.5595498085021973 + }, + { + "auxiliary_loss_clip": 0.01205053, + "auxiliary_loss_mlp": 0.01037356, + "balance_loss_clip": 1.06331253, + "balance_loss_mlp": 1.02405179, + "epoch": 0.08873925329164913, + "flos": 23658704173440.0, + "grad_norm": 1.8882247729683301, + "language_loss": 0.82776213, + "learning_rate": 3.9639823720693115e-06, + "loss": 0.85018617, + "num_input_tokens_seen": 15711650, + "step": 738, + "time_per_iteration": 2.6314258575439453 + }, + { + "auxiliary_loss_clip": 0.01109011, + "auxiliary_loss_mlp": 0.01019373, + "balance_loss_clip": 1.04003358, + "balance_loss_mlp": 1.01491451, + "epoch": 0.08885949618228822, + "flos": 71831541893760.0, + "grad_norm": 0.8482495326162162, + "language_loss": 0.59987473, + "learning_rate": 3.963835054537102e-06, + "loss": 0.62115854, + "num_input_tokens_seen": 15780615, + "step": 739, + "time_per_iteration": 3.193509817123413 + }, + { + "auxiliary_loss_clip": 0.01220775, + "auxiliary_loss_mlp": 0.01055811, + "balance_loss_clip": 1.06141114, + "balance_loss_mlp": 1.0437355, + "epoch": 0.08897973907292732, + "flos": 22346169298560.0, + "grad_norm": 2.2139264102670073, + "language_loss": 0.61381447, + "learning_rate": 3.963687439092676e-06, + "loss": 0.63658041, + "num_input_tokens_seen": 15801300, + "step": 740, + "time_per_iteration": 3.3782408237457275 + }, + { + "auxiliary_loss_clip": 0.01240986, + "auxiliary_loss_mlp": 0.01046714, + "balance_loss_clip": 1.06674385, + "balance_loss_mlp": 1.03450692, + "epoch": 0.0890999819635664, + "flos": 21251827589760.0, + "grad_norm": 3.9173797294777475, + "language_loss": 0.80378187, + "learning_rate": 3.963539525758427e-06, + "loss": 0.8266589, + "num_input_tokens_seen": 15820860, + "step": 741, + "time_per_iteration": 2.5305728912353516 + }, + { + "auxiliary_loss_clip": 0.01226877, + "auxiliary_loss_mlp": 0.01039917, + "balance_loss_clip": 1.0660212, + "balance_loss_mlp": 1.0269649, + "epoch": 0.0892202248542055, + "flos": 25370888745600.0, + "grad_norm": 2.430622964395486, + "language_loss": 0.67870855, + "learning_rate": 3.9633913145567925e-06, + "loss": 0.7013765, + "num_input_tokens_seen": 15841350, + "step": 742, + "time_per_iteration": 2.564605951309204 + }, + { + "auxiliary_loss_clip": 0.01225065, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.06720603, + "balance_loss_mlp": 1.02282858, + "epoch": 0.08934046774484458, + "flos": 24457895827200.0, + "grad_norm": 2.2087618900411017, + "language_loss": 0.81364214, + "learning_rate": 3.9632428055102575e-06, + "loss": 0.83623862, + "num_input_tokens_seen": 15861360, + "step": 743, + "time_per_iteration": 2.5781798362731934 + }, + { + "auxiliary_loss_clip": 0.01246759, + "auxiliary_loss_mlp": 0.01046256, + "balance_loss_clip": 1.07117701, + "balance_loss_mlp": 1.03242755, + "epoch": 0.08946071063548368, + "flos": 35772773414400.0, + "grad_norm": 2.4649839207881596, + "language_loss": 0.66916597, + "learning_rate": 3.9630939986413495e-06, + "loss": 0.69209611, + "num_input_tokens_seen": 15883160, + "step": 744, + "time_per_iteration": 2.6227333545684814 + }, + { + "auxiliary_loss_clip": 0.01197922, + "auxiliary_loss_mlp": 0.01049724, + "balance_loss_clip": 1.06317616, + "balance_loss_mlp": 1.0368731, + "epoch": 0.08958095352612276, + "flos": 14356584167040.0, + "grad_norm": 1.8499385600165459, + "language_loss": 0.78343606, + "learning_rate": 3.962944893972643e-06, + "loss": 0.80591255, + "num_input_tokens_seen": 15901610, + "step": 745, + "time_per_iteration": 3.3605949878692627 + }, + { + "auxiliary_loss_clip": 0.01223982, + "auxiliary_loss_mlp": 0.01044765, + "balance_loss_clip": 1.06447363, + "balance_loss_mlp": 1.03206921, + "epoch": 0.08970119641676186, + "flos": 17853277345920.0, + "grad_norm": 3.0551993831848607, + "language_loss": 0.91105425, + "learning_rate": 3.962795491526756e-06, + "loss": 0.93374175, + "num_input_tokens_seen": 15918770, + "step": 746, + "time_per_iteration": 3.4027154445648193 + }, + { + "auxiliary_loss_clip": 0.01264162, + "auxiliary_loss_mlp": 0.01056027, + "balance_loss_clip": 1.07289803, + "balance_loss_mlp": 1.04190671, + "epoch": 0.08982143930740095, + "flos": 20811670329600.0, + "grad_norm": 2.1750555823974267, + "language_loss": 0.89316124, + "learning_rate": 3.962645791326354e-06, + "loss": 0.91636312, + "num_input_tokens_seen": 15938025, + "step": 747, + "time_per_iteration": 2.490537405014038 + }, + { + "auxiliary_loss_clip": 0.01239971, + "auxiliary_loss_mlp": 0.01041097, + "balance_loss_clip": 1.06860781, + "balance_loss_mlp": 1.02948022, + "epoch": 0.08994168219804004, + "flos": 24097712198400.0, + "grad_norm": 1.9740534642002414, + "language_loss": 0.82780039, + "learning_rate": 3.962495793394146e-06, + "loss": 0.85061109, + "num_input_tokens_seen": 15957215, + "step": 748, + "time_per_iteration": 3.3045437335968018 + }, + { + "auxiliary_loss_clip": 0.01143532, + "auxiliary_loss_mlp": 0.01040568, + "balance_loss_clip": 1.03070569, + "balance_loss_mlp": 1.03584719, + "epoch": 0.09006192508867913, + "flos": 57188893812480.0, + "grad_norm": 0.7418661616341179, + "language_loss": 0.61205, + "learning_rate": 3.9623454977528864e-06, + "loss": 0.63389093, + "num_input_tokens_seen": 16015870, + "step": 749, + "time_per_iteration": 2.89220929145813 + }, + { + "auxiliary_loss_clip": 0.01214954, + "auxiliary_loss_mlp": 0.01051043, + "balance_loss_clip": 1.06468487, + "balance_loss_mlp": 1.03827572, + "epoch": 0.09018216797931822, + "flos": 20487505063680.0, + "grad_norm": 1.638919864447389, + "language_loss": 0.85080326, + "learning_rate": 3.962194904425375e-06, + "loss": 0.87346327, + "num_input_tokens_seen": 16036500, + "step": 750, + "time_per_iteration": 2.6000778675079346 + }, + { + "auxiliary_loss_clip": 0.01236507, + "auxiliary_loss_mlp": 0.01042042, + "balance_loss_clip": 1.06613481, + "balance_loss_mlp": 1.029459, + "epoch": 0.09030241086995731, + "flos": 22638123043200.0, + "grad_norm": 3.524315116278463, + "language_loss": 0.6796189, + "learning_rate": 3.9620440134344566e-06, + "loss": 0.70240438, + "num_input_tokens_seen": 16054655, + "step": 751, + "time_per_iteration": 2.5078063011169434 + }, + { + "auxiliary_loss_clip": 0.01207956, + "auxiliary_loss_mlp": 0.01050457, + "balance_loss_clip": 1.0646832, + "balance_loss_mlp": 1.03704023, + "epoch": 0.09042265376059641, + "flos": 21871502046720.0, + "grad_norm": 2.90199615579018, + "language_loss": 0.82166553, + "learning_rate": 3.9618928248030215e-06, + "loss": 0.84424967, + "num_input_tokens_seen": 16074165, + "step": 752, + "time_per_iteration": 2.5974485874176025 + }, + { + "auxiliary_loss_clip": 0.01239493, + "auxiliary_loss_mlp": 0.01049187, + "balance_loss_clip": 1.06807065, + "balance_loss_mlp": 1.03687274, + "epoch": 0.0905428966512355, + "flos": 24316192673280.0, + "grad_norm": 2.1489688556392674, + "language_loss": 0.83093143, + "learning_rate": 3.961741338554005e-06, + "loss": 0.85381818, + "num_input_tokens_seen": 16092505, + "step": 753, + "time_per_iteration": 2.5624661445617676 + }, + { + "auxiliary_loss_clip": 0.01233045, + "auxiliary_loss_mlp": 0.01055188, + "balance_loss_clip": 1.06765342, + "balance_loss_mlp": 1.04195607, + "epoch": 0.09066313954187459, + "flos": 35845061535360.0, + "grad_norm": 2.1602943930785923, + "language_loss": 0.75770986, + "learning_rate": 3.9615895547103865e-06, + "loss": 0.7805922, + "num_input_tokens_seen": 16116150, + "step": 754, + "time_per_iteration": 2.742488384246826 + }, + { + "auxiliary_loss_clip": 0.01223138, + "auxiliary_loss_mlp": 0.01050512, + "balance_loss_clip": 1.06320918, + "balance_loss_mlp": 1.03761959, + "epoch": 0.09078338243251367, + "flos": 29168729550720.0, + "grad_norm": 1.839582196697024, + "language_loss": 0.77699804, + "learning_rate": 3.961437473295193e-06, + "loss": 0.79973447, + "num_input_tokens_seen": 16136295, + "step": 755, + "time_per_iteration": 2.5963313579559326 + }, + { + "auxiliary_loss_clip": 0.01178461, + "auxiliary_loss_mlp": 0.01046698, + "balance_loss_clip": 1.05443311, + "balance_loss_mlp": 1.03419936, + "epoch": 0.09090362532315277, + "flos": 21907699977600.0, + "grad_norm": 2.41548011638984, + "language_loss": 0.72252071, + "learning_rate": 3.961285094331495e-06, + "loss": 0.74477232, + "num_input_tokens_seen": 16154210, + "step": 756, + "time_per_iteration": 2.641754388809204 + }, + { + "auxiliary_loss_clip": 0.01251217, + "auxiliary_loss_mlp": 0.01041559, + "balance_loss_clip": 1.06597853, + "balance_loss_mlp": 1.029948, + "epoch": 0.09102386821379185, + "flos": 27344503480320.0, + "grad_norm": 4.562012824283252, + "language_loss": 0.86085641, + "learning_rate": 3.961132417842406e-06, + "loss": 0.88378417, + "num_input_tokens_seen": 16173995, + "step": 757, + "time_per_iteration": 2.521122455596924 + }, + { + "auxiliary_loss_clip": 0.01232284, + "auxiliary_loss_mlp": 0.01050963, + "balance_loss_clip": 1.06513047, + "balance_loss_mlp": 1.03897023, + "epoch": 0.09114411110443095, + "flos": 20813501923200.0, + "grad_norm": 3.489379949445828, + "language_loss": 0.75569755, + "learning_rate": 3.960979443851089e-06, + "loss": 0.77853, + "num_input_tokens_seen": 16191020, + "step": 758, + "time_per_iteration": 2.5540335178375244 + }, + { + "auxiliary_loss_clip": 0.01222789, + "auxiliary_loss_mlp": 0.01042283, + "balance_loss_clip": 1.06476259, + "balance_loss_mlp": 1.02890193, + "epoch": 0.09126435399507005, + "flos": 26145949438080.0, + "grad_norm": 1.712151297559235, + "language_loss": 0.7875455, + "learning_rate": 3.96082617238075e-06, + "loss": 0.81019622, + "num_input_tokens_seen": 16213645, + "step": 759, + "time_per_iteration": 2.618605136871338 + }, + { + "auxiliary_loss_clip": 0.0122199, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.06349874, + "balance_loss_mlp": 1.02709365, + "epoch": 0.09138459688570913, + "flos": 24388911757440.0, + "grad_norm": 2.581203286872189, + "language_loss": 0.79955423, + "learning_rate": 3.960672603454639e-06, + "loss": 0.82215995, + "num_input_tokens_seen": 16233625, + "step": 760, + "time_per_iteration": 2.5661139488220215 + }, + { + "auxiliary_loss_clip": 0.01234321, + "auxiliary_loss_mlp": 0.01047414, + "balance_loss_clip": 1.06649208, + "balance_loss_mlp": 1.03436661, + "epoch": 0.09150483977634823, + "flos": 21032664756480.0, + "grad_norm": 2.908520015453707, + "language_loss": 0.77360773, + "learning_rate": 3.960518737096054e-06, + "loss": 0.7964251, + "num_input_tokens_seen": 16253255, + "step": 761, + "time_per_iteration": 2.5108797550201416 + }, + { + "auxiliary_loss_clip": 0.01239668, + "auxiliary_loss_mlp": 0.01039487, + "balance_loss_clip": 1.06747317, + "balance_loss_mlp": 1.02695203, + "epoch": 0.09162508266698731, + "flos": 22856998567680.0, + "grad_norm": 2.1656183213501445, + "language_loss": 0.72762758, + "learning_rate": 3.960364573328334e-06, + "loss": 0.75041914, + "num_input_tokens_seen": 16272580, + "step": 762, + "time_per_iteration": 2.537421941757202 + }, + { + "auxiliary_loss_clip": 0.01208382, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.06091428, + "balance_loss_mlp": 1.02698278, + "epoch": 0.0917453255576264, + "flos": 21724411852800.0, + "grad_norm": 1.9035893829647503, + "language_loss": 0.8847158, + "learning_rate": 3.9602101121748675e-06, + "loss": 0.90720385, + "num_input_tokens_seen": 16293075, + "step": 763, + "time_per_iteration": 2.5745084285736084 + }, + { + "auxiliary_loss_clip": 0.01223701, + "auxiliary_loss_mlp": 0.01044249, + "balance_loss_clip": 1.06795657, + "balance_loss_mlp": 1.03285289, + "epoch": 0.0918655684482655, + "flos": 14609215497600.0, + "grad_norm": 3.0841465319861605, + "language_loss": 0.72333789, + "learning_rate": 3.960055353659085e-06, + "loss": 0.7460174, + "num_input_tokens_seen": 16310185, + "step": 764, + "time_per_iteration": 2.49641489982605 + }, + { + "auxiliary_loss_clip": 0.01211061, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.06531608, + "balance_loss_mlp": 1.02291179, + "epoch": 0.09198581133890459, + "flos": 23435016226560.0, + "grad_norm": 1.8135206928884973, + "language_loss": 0.83774364, + "learning_rate": 3.959900297804465e-06, + "loss": 0.86020637, + "num_input_tokens_seen": 16330355, + "step": 765, + "time_per_iteration": 2.6304118633270264 + }, + { + "auxiliary_loss_clip": 0.01210006, + "auxiliary_loss_mlp": 0.01039267, + "balance_loss_clip": 1.06236756, + "balance_loss_mlp": 1.02711332, + "epoch": 0.09210605422954368, + "flos": 16795887753600.0, + "grad_norm": 1.7923047136102985, + "language_loss": 0.77408326, + "learning_rate": 3.9597449446345276e-06, + "loss": 0.79657596, + "num_input_tokens_seen": 16347600, + "step": 766, + "time_per_iteration": 2.6017959117889404 + }, + { + "auxiliary_loss_clip": 0.01209076, + "auxiliary_loss_mlp": 0.01037431, + "balance_loss_clip": 1.05900049, + "balance_loss_mlp": 1.02579629, + "epoch": 0.09222629712018277, + "flos": 22674249146880.0, + "grad_norm": 2.241517571459111, + "language_loss": 0.8341403, + "learning_rate": 3.95958929417284e-06, + "loss": 0.85660541, + "num_input_tokens_seen": 16365755, + "step": 767, + "time_per_iteration": 3.4614627361297607 + }, + { + "auxiliary_loss_clip": 0.01131984, + "auxiliary_loss_mlp": 0.0100879, + "balance_loss_clip": 1.0303998, + "balance_loss_mlp": 1.00435531, + "epoch": 0.09234654001082186, + "flos": 69976756327680.0, + "grad_norm": 0.7498201645231182, + "language_loss": 0.58803904, + "learning_rate": 3.9594333464430145e-06, + "loss": 0.60944676, + "num_input_tokens_seen": 16435245, + "step": 768, + "time_per_iteration": 3.232046365737915 + }, + { + "auxiliary_loss_clip": 0.0114631, + "auxiliary_loss_mlp": 0.0104232, + "balance_loss_clip": 1.05068898, + "balance_loss_mlp": 1.03140604, + "epoch": 0.09246678290146094, + "flos": 20011437181440.0, + "grad_norm": 1.877993792484274, + "language_loss": 0.88569391, + "learning_rate": 3.959277101468709e-06, + "loss": 0.90758014, + "num_input_tokens_seen": 16454795, + "step": 769, + "time_per_iteration": 2.844501495361328 + }, + { + "auxiliary_loss_clip": 0.01205583, + "auxiliary_loss_mlp": 0.01048922, + "balance_loss_clip": 1.06103909, + "balance_loss_mlp": 1.03714979, + "epoch": 0.09258702579210004, + "flos": 17747448900480.0, + "grad_norm": 2.381511027482938, + "language_loss": 0.78577614, + "learning_rate": 3.959120559273624e-06, + "loss": 0.80832124, + "num_input_tokens_seen": 16472580, + "step": 770, + "time_per_iteration": 2.8863327503204346 + }, + { + "auxiliary_loss_clip": 0.01205286, + "auxiliary_loss_mlp": 0.01042845, + "balance_loss_clip": 1.0624392, + "balance_loss_mlp": 1.0311209, + "epoch": 0.09270726868273914, + "flos": 20886544229760.0, + "grad_norm": 1.7712475182230991, + "language_loss": 0.83321321, + "learning_rate": 3.958963719881509e-06, + "loss": 0.85569459, + "num_input_tokens_seen": 16490670, + "step": 771, + "time_per_iteration": 3.3953518867492676 + }, + { + "auxiliary_loss_clip": 0.01238179, + "auxiliary_loss_mlp": 0.01036271, + "balance_loss_clip": 1.06904769, + "balance_loss_mlp": 1.02334261, + "epoch": 0.09282751157337822, + "flos": 17015697031680.0, + "grad_norm": 2.4160427784363647, + "language_loss": 0.93895793, + "learning_rate": 3.958806583316154e-06, + "loss": 0.96170247, + "num_input_tokens_seen": 16508640, + "step": 772, + "time_per_iteration": 2.5044260025024414 + }, + { + "auxiliary_loss_clip": 0.01253304, + "auxiliary_loss_mlp": 0.01032895, + "balance_loss_clip": 1.07007515, + "balance_loss_mlp": 1.02168894, + "epoch": 0.09294775446401732, + "flos": 32523647748480.0, + "grad_norm": 2.2935244338408043, + "language_loss": 0.78857535, + "learning_rate": 3.9586491496013985e-06, + "loss": 0.81143737, + "num_input_tokens_seen": 16531035, + "step": 773, + "time_per_iteration": 3.506129503250122 + }, + { + "auxiliary_loss_clip": 0.01242441, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_clip": 1.06782627, + "balance_loss_mlp": 1.03559327, + "epoch": 0.0930679973546564, + "flos": 18259750627200.0, + "grad_norm": 2.443343993646761, + "language_loss": 0.82959175, + "learning_rate": 3.958491418761124e-06, + "loss": 0.85248786, + "num_input_tokens_seen": 16548605, + "step": 774, + "time_per_iteration": 2.484351873397827 + }, + { + "auxiliary_loss_clip": 0.01220364, + "auxiliary_loss_mlp": 0.01039879, + "balance_loss_clip": 1.060323, + "balance_loss_mlp": 1.02854264, + "epoch": 0.0931882402452955, + "flos": 21099745405440.0, + "grad_norm": 3.511223698697841, + "language_loss": 0.72452903, + "learning_rate": 3.958333390819258e-06, + "loss": 0.74713147, + "num_input_tokens_seen": 16565535, + "step": 775, + "time_per_iteration": 3.320323944091797 + }, + { + "auxiliary_loss_clip": 0.01252348, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.06885231, + "balance_loss_mlp": 1.02976751, + "epoch": 0.0933084831359346, + "flos": 24207275658240.0, + "grad_norm": 2.162574745285735, + "language_loss": 0.80319798, + "learning_rate": 3.9581750657997754e-06, + "loss": 0.82613021, + "num_input_tokens_seen": 16584900, + "step": 776, + "time_per_iteration": 2.5163486003875732 + }, + { + "auxiliary_loss_clip": 0.01218066, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.06113338, + "balance_loss_mlp": 1.02718973, + "epoch": 0.09342872602657368, + "flos": 25480272637440.0, + "grad_norm": 1.8029756803353811, + "language_loss": 0.89400685, + "learning_rate": 3.95801644372669e-06, + "loss": 0.91657108, + "num_input_tokens_seen": 16604805, + "step": 777, + "time_per_iteration": 2.582204818725586 + }, + { + "auxiliary_loss_clip": 0.01227318, + "auxiliary_loss_mlp": 0.01041934, + "balance_loss_clip": 1.06274056, + "balance_loss_mlp": 1.03037691, + "epoch": 0.09354896891721277, + "flos": 23149060053120.0, + "grad_norm": 2.7455108891514133, + "language_loss": 0.84638035, + "learning_rate": 3.957857524624068e-06, + "loss": 0.86907291, + "num_input_tokens_seen": 16623685, + "step": 778, + "time_per_iteration": 2.5764808654785156 + }, + { + "auxiliary_loss_clip": 0.01220554, + "auxiliary_loss_mlp": 0.01041214, + "balance_loss_clip": 1.06451082, + "balance_loss_mlp": 1.02994251, + "epoch": 0.09366921180785186, + "flos": 24279563779200.0, + "grad_norm": 1.7713504253024963, + "language_loss": 0.89744759, + "learning_rate": 3.957698308516016e-06, + "loss": 0.92006528, + "num_input_tokens_seen": 16644985, + "step": 779, + "time_per_iteration": 2.5860352516174316 + }, + { + "auxiliary_loss_clip": 0.01232785, + "auxiliary_loss_mlp": 0.00765174, + "balance_loss_clip": 1.06731415, + "balance_loss_mlp": 1.00008166, + "epoch": 0.09378945469849095, + "flos": 18730036419840.0, + "grad_norm": 1.9528626408057206, + "language_loss": 0.82340336, + "learning_rate": 3.957538795426688e-06, + "loss": 0.84338295, + "num_input_tokens_seen": 16662410, + "step": 780, + "time_per_iteration": 2.4987144470214844 + }, + { + "auxiliary_loss_clip": 0.01222187, + "auxiliary_loss_mlp": 0.01045749, + "balance_loss_clip": 1.06392038, + "balance_loss_mlp": 1.0328387, + "epoch": 0.09390969758913004, + "flos": 23218834222080.0, + "grad_norm": 2.775819738756101, + "language_loss": 0.7731142, + "learning_rate": 3.9573789853802804e-06, + "loss": 0.79579353, + "num_input_tokens_seen": 16680885, + "step": 781, + "time_per_iteration": 2.5398433208465576 + }, + { + "auxiliary_loss_clip": 0.01222441, + "auxiliary_loss_mlp": 0.00764928, + "balance_loss_clip": 1.0673008, + "balance_loss_mlp": 1.00009966, + "epoch": 0.09402994047976913, + "flos": 19646728439040.0, + "grad_norm": 2.2231868911922863, + "language_loss": 0.75059783, + "learning_rate": 3.957218878401037e-06, + "loss": 0.77047157, + "num_input_tokens_seen": 16699375, + "step": 782, + "time_per_iteration": 2.52759051322937 + }, + { + "auxiliary_loss_clip": 0.01253558, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.06937087, + "balance_loss_mlp": 1.03211188, + "epoch": 0.09415018337040823, + "flos": 29420463041280.0, + "grad_norm": 2.1228779026288866, + "language_loss": 0.89221221, + "learning_rate": 3.957058474513246e-06, + "loss": 0.91519105, + "num_input_tokens_seen": 16719230, + "step": 783, + "time_per_iteration": 2.536959409713745 + }, + { + "auxiliary_loss_clip": 0.0123381, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_clip": 1.06738698, + "balance_loss_mlp": 1.03640795, + "epoch": 0.09427042626104731, + "flos": 24572092141440.0, + "grad_norm": 2.098397684064178, + "language_loss": 0.78701937, + "learning_rate": 3.956897773741241e-06, + "loss": 0.80982971, + "num_input_tokens_seen": 16738220, + "step": 784, + "time_per_iteration": 2.520097494125366 + }, + { + "auxiliary_loss_clip": 0.01211045, + "auxiliary_loss_mlp": 0.01045078, + "balance_loss_clip": 1.06269038, + "balance_loss_mlp": 1.03306139, + "epoch": 0.09439066915168641, + "flos": 26359581576960.0, + "grad_norm": 1.803975054384304, + "language_loss": 0.71477062, + "learning_rate": 3.956736776109398e-06, + "loss": 0.73733187, + "num_input_tokens_seen": 16759395, + "step": 785, + "time_per_iteration": 2.5932247638702393 + }, + { + "auxiliary_loss_clip": 0.01227363, + "auxiliary_loss_mlp": 0.00765931, + "balance_loss_clip": 1.06248522, + "balance_loss_mlp": 1.00014472, + "epoch": 0.09451091204232549, + "flos": 19427278296960.0, + "grad_norm": 1.96090050861924, + "language_loss": 0.83667952, + "learning_rate": 3.956575481642143e-06, + "loss": 0.85661244, + "num_input_tokens_seen": 16778285, + "step": 786, + "time_per_iteration": 2.526232957839966 + }, + { + "auxiliary_loss_clip": 0.01180531, + "auxiliary_loss_mlp": 0.01035462, + "balance_loss_clip": 1.05511522, + "balance_loss_mlp": 1.02349329, + "epoch": 0.09463115493296459, + "flos": 25368051571200.0, + "grad_norm": 2.9720776375749427, + "language_loss": 0.74837685, + "learning_rate": 3.956413890363943e-06, + "loss": 0.77053678, + "num_input_tokens_seen": 16795265, + "step": 787, + "time_per_iteration": 2.658855676651001 + }, + { + "auxiliary_loss_clip": 0.01234635, + "auxiliary_loss_mlp": 0.01039224, + "balance_loss_clip": 1.06727445, + "balance_loss_mlp": 1.02813196, + "epoch": 0.09475139782360369, + "flos": 10123254869760.0, + "grad_norm": 1.9638130848844293, + "language_loss": 0.81917673, + "learning_rate": 3.956252002299312e-06, + "loss": 0.84191531, + "num_input_tokens_seen": 16811165, + "step": 788, + "time_per_iteration": 2.5442371368408203 + }, + { + "auxiliary_loss_clip": 0.01251322, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.06852293, + "balance_loss_mlp": 1.02258253, + "epoch": 0.09487164071424277, + "flos": 17231088936960.0, + "grad_norm": 1.8412229805445395, + "language_loss": 0.90676838, + "learning_rate": 3.956089817472807e-06, + "loss": 0.92962176, + "num_input_tokens_seen": 16828470, + "step": 789, + "time_per_iteration": 2.519622802734375 + }, + { + "auxiliary_loss_clip": 0.01220214, + "auxiliary_loss_mlp": 0.01038964, + "balance_loss_clip": 1.06710017, + "balance_loss_mlp": 1.02788401, + "epoch": 0.09499188360488187, + "flos": 30849564528000.0, + "grad_norm": 2.086778008739669, + "language_loss": 0.85708684, + "learning_rate": 3.955927335909032e-06, + "loss": 0.87967861, + "num_input_tokens_seen": 16851680, + "step": 790, + "time_per_iteration": 2.6684296131134033 + }, + { + "auxiliary_loss_clip": 0.01188987, + "auxiliary_loss_mlp": 0.01040374, + "balance_loss_clip": 1.06550288, + "balance_loss_mlp": 1.02929318, + "epoch": 0.09511212649552095, + "flos": 29351694453120.0, + "grad_norm": 2.037735741394394, + "language_loss": 0.75897175, + "learning_rate": 3.955764557632634e-06, + "loss": 0.78126538, + "num_input_tokens_seen": 16871490, + "step": 791, + "time_per_iteration": 2.6928322315216064 + }, + { + "auxiliary_loss_clip": 0.01215442, + "auxiliary_loss_mlp": 0.01038112, + "balance_loss_clip": 1.06282735, + "balance_loss_mlp": 1.02656031, + "epoch": 0.09523236938616005, + "flos": 10378687461120.0, + "grad_norm": 2.083291490242172, + "language_loss": 0.94643569, + "learning_rate": 3.955601482668309e-06, + "loss": 0.96897125, + "num_input_tokens_seen": 16889350, + "step": 792, + "time_per_iteration": 2.5200576782226562 + }, + { + "auxiliary_loss_clip": 0.01183547, + "auxiliary_loss_mlp": 0.01040419, + "balance_loss_clip": 1.05602777, + "balance_loss_mlp": 1.02699637, + "epoch": 0.09535261227679913, + "flos": 19061815368960.0, + "grad_norm": 1.7384333115757473, + "language_loss": 0.88378692, + "learning_rate": 3.955438111040794e-06, + "loss": 0.9060266, + "num_input_tokens_seen": 16907625, + "step": 793, + "time_per_iteration": 2.6130173206329346 + }, + { + "auxiliary_loss_clip": 0.01183201, + "auxiliary_loss_mlp": 0.0104513, + "balance_loss_clip": 1.05778408, + "balance_loss_mlp": 1.034199, + "epoch": 0.09547285516743823, + "flos": 20922993555840.0, + "grad_norm": 2.007447161959936, + "language_loss": 0.79972219, + "learning_rate": 3.955274442774873e-06, + "loss": 0.82200551, + "num_input_tokens_seen": 16926205, + "step": 794, + "time_per_iteration": 3.4259731769561768 + }, + { + "auxiliary_loss_clip": 0.01234252, + "auxiliary_loss_mlp": 0.01044026, + "balance_loss_clip": 1.06554341, + "balance_loss_mlp": 1.03181863, + "epoch": 0.09559309805807732, + "flos": 30154405639680.0, + "grad_norm": 2.8176491728850506, + "language_loss": 0.70629764, + "learning_rate": 3.9551104778953725e-06, + "loss": 0.72908038, + "num_input_tokens_seen": 16946500, + "step": 795, + "time_per_iteration": 2.583831310272217 + }, + { + "auxiliary_loss_clip": 0.01203554, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.06006777, + "balance_loss_mlp": 1.02169394, + "epoch": 0.0957133409487164, + "flos": 21066743784960.0, + "grad_norm": 1.8080487606794478, + "language_loss": 0.85488039, + "learning_rate": 3.954946216427167e-06, + "loss": 0.87724489, + "num_input_tokens_seen": 16966960, + "step": 796, + "time_per_iteration": 2.593061923980713 + }, + { + "auxiliary_loss_clip": 0.01102803, + "auxiliary_loss_mlp": 0.01008399, + "balance_loss_clip": 1.0285213, + "balance_loss_mlp": 1.0036782, + "epoch": 0.0958335838393555, + "flos": 71297979315840.0, + "grad_norm": 0.9598719269203762, + "language_loss": 0.61594796, + "learning_rate": 3.954781658395176e-06, + "loss": 0.63705993, + "num_input_tokens_seen": 17023215, + "step": 797, + "time_per_iteration": 3.097337245941162 + }, + { + "auxiliary_loss_clip": 0.01224686, + "auxiliary_loss_mlp": 0.01038883, + "balance_loss_clip": 1.06469059, + "balance_loss_mlp": 1.02677083, + "epoch": 0.09595382672999458, + "flos": 21872974504320.0, + "grad_norm": 3.0385188852364826, + "language_loss": 0.92234981, + "learning_rate": 3.95461680382436e-06, + "loss": 0.94498551, + "num_input_tokens_seen": 17042140, + "step": 798, + "time_per_iteration": 3.3518388271331787 + }, + { + "auxiliary_loss_clip": 0.01240478, + "auxiliary_loss_mlp": 0.01042074, + "balance_loss_clip": 1.06897712, + "balance_loss_mlp": 1.02977157, + "epoch": 0.09607406962063368, + "flos": 18695562341760.0, + "grad_norm": 3.224595268756587, + "language_loss": 0.86197603, + "learning_rate": 3.9544516527397295e-06, + "loss": 0.88480151, + "num_input_tokens_seen": 17058490, + "step": 799, + "time_per_iteration": 2.55964994430542 + }, + { + "auxiliary_loss_clip": 0.01204065, + "auxiliary_loss_mlp": 0.01035895, + "balance_loss_clip": 1.06159639, + "balance_loss_mlp": 1.02425396, + "epoch": 0.09619431251127276, + "flos": 22568456615040.0, + "grad_norm": 2.21245382892145, + "language_loss": 0.80572426, + "learning_rate": 3.954286205166338e-06, + "loss": 0.82812387, + "num_input_tokens_seen": 17079655, + "step": 800, + "time_per_iteration": 3.525792360305786 + }, + { + "auxiliary_loss_clip": 0.012428, + "auxiliary_loss_mlp": 0.01043224, + "balance_loss_clip": 1.07241571, + "balance_loss_mlp": 1.03049839, + "epoch": 0.09631455540191186, + "flos": 14246230608000.0, + "grad_norm": 8.926001993125734, + "language_loss": 0.83849692, + "learning_rate": 3.954120461129282e-06, + "loss": 0.86135709, + "num_input_tokens_seen": 17097065, + "step": 801, + "time_per_iteration": 3.286698818206787 + }, + { + "auxiliary_loss_clip": 0.01256027, + "auxiliary_loss_mlp": 0.010466, + "balance_loss_clip": 1.07260573, + "balance_loss_mlp": 1.03525114, + "epoch": 0.09643479829255096, + "flos": 20740387789440.0, + "grad_norm": 2.315680098338758, + "language_loss": 0.84151202, + "learning_rate": 3.953954420653706e-06, + "loss": 0.86453825, + "num_input_tokens_seen": 17114090, + "step": 802, + "time_per_iteration": 2.5248186588287354 + }, + { + "auxiliary_loss_clip": 0.01232975, + "auxiliary_loss_mlp": 0.01043224, + "balance_loss_clip": 1.06682491, + "balance_loss_mlp": 1.03179741, + "epoch": 0.09655504118319004, + "flos": 24420476833920.0, + "grad_norm": 2.0181071838890965, + "language_loss": 0.88420212, + "learning_rate": 3.953788083764798e-06, + "loss": 0.90696412, + "num_input_tokens_seen": 17133325, + "step": 803, + "time_per_iteration": 2.5352556705474854 + }, + { + "auxiliary_loss_clip": 0.01189739, + "auxiliary_loss_mlp": 0.01048035, + "balance_loss_clip": 1.06242228, + "balance_loss_mlp": 1.0364002, + "epoch": 0.09667528407382914, + "flos": 18441961344000.0, + "grad_norm": 2.496995321964076, + "language_loss": 0.92321628, + "learning_rate": 3.953621450487792e-06, + "loss": 0.94559401, + "num_input_tokens_seen": 17151945, + "step": 804, + "time_per_iteration": 2.595557928085327 + }, + { + "auxiliary_loss_clip": 0.01142567, + "auxiliary_loss_mlp": 0.0101372, + "balance_loss_clip": 1.03245211, + "balance_loss_mlp": 1.00902319, + "epoch": 0.09679552696446822, + "flos": 70816455544320.0, + "grad_norm": 0.8332030199490637, + "language_loss": 0.61185551, + "learning_rate": 3.953454520847964e-06, + "loss": 0.63341832, + "num_input_tokens_seen": 17216790, + "step": 805, + "time_per_iteration": 3.22357439994812 + }, + { + "auxiliary_loss_clip": 0.0121738, + "auxiliary_loss_mlp": 0.01045988, + "balance_loss_clip": 1.06415582, + "balance_loss_mlp": 1.03172421, + "epoch": 0.09691576985510732, + "flos": 21945514020480.0, + "grad_norm": 1.9788814838925464, + "language_loss": 0.73775649, + "learning_rate": 3.9532872948706395e-06, + "loss": 0.76039016, + "num_input_tokens_seen": 17236285, + "step": 806, + "time_per_iteration": 2.7168655395507812 + }, + { + "auxiliary_loss_clip": 0.01220711, + "auxiliary_loss_mlp": 0.01047229, + "balance_loss_clip": 1.06429696, + "balance_loss_mlp": 1.03431857, + "epoch": 0.09703601274574641, + "flos": 17965211103360.0, + "grad_norm": 2.846603981391811, + "language_loss": 0.83077765, + "learning_rate": 3.9531197725811845e-06, + "loss": 0.85345703, + "num_input_tokens_seen": 17251670, + "step": 807, + "time_per_iteration": 2.551278591156006 + }, + { + "auxiliary_loss_clip": 0.01252512, + "auxiliary_loss_mlp": 0.01048074, + "balance_loss_clip": 1.07249331, + "balance_loss_mlp": 1.03608131, + "epoch": 0.0971562556363855, + "flos": 22162162901760.0, + "grad_norm": 2.3128161223949384, + "language_loss": 0.88030207, + "learning_rate": 3.952951954005013e-06, + "loss": 0.90330791, + "num_input_tokens_seen": 17271355, + "step": 808, + "time_per_iteration": 2.4848601818084717 + }, + { + "auxiliary_loss_clip": 0.01216005, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.06033742, + "balance_loss_mlp": 1.0267663, + "epoch": 0.0972764985270246, + "flos": 25848716394240.0, + "grad_norm": 1.8299890768925609, + "language_loss": 0.84859526, + "learning_rate": 3.952783839167584e-06, + "loss": 0.8711338, + "num_input_tokens_seen": 17291400, + "step": 809, + "time_per_iteration": 2.5771279335021973 + }, + { + "auxiliary_loss_clip": 0.01233059, + "auxiliary_loss_mlp": 0.01050485, + "balance_loss_clip": 1.06492054, + "balance_loss_mlp": 1.03826046, + "epoch": 0.09739674141766368, + "flos": 20339373375360.0, + "grad_norm": 2.4756381520636106, + "language_loss": 0.74346733, + "learning_rate": 3.952615428094398e-06, + "loss": 0.76630276, + "num_input_tokens_seen": 17310920, + "step": 810, + "time_per_iteration": 2.5418622493743896 + }, + { + "auxiliary_loss_clip": 0.01179405, + "auxiliary_loss_mlp": 0.0104426, + "balance_loss_clip": 1.05624688, + "balance_loss_mlp": 1.03253555, + "epoch": 0.09751698430830277, + "flos": 15743059188480.0, + "grad_norm": 1.8843149909741272, + "language_loss": 0.73117381, + "learning_rate": 3.952446720811004e-06, + "loss": 0.75341046, + "num_input_tokens_seen": 17329245, + "step": 811, + "time_per_iteration": 2.5892035961151123 + }, + { + "auxiliary_loss_clip": 0.01098297, + "auxiliary_loss_mlp": 0.01012904, + "balance_loss_clip": 1.02645946, + "balance_loss_mlp": 1.00827909, + "epoch": 0.09763722719894186, + "flos": 63716806800000.0, + "grad_norm": 0.9533060309205846, + "language_loss": 0.63613534, + "learning_rate": 3.952277717342995e-06, + "loss": 0.65724736, + "num_input_tokens_seen": 17395680, + "step": 812, + "time_per_iteration": 3.256331205368042 + }, + { + "auxiliary_loss_clip": 0.01225391, + "auxiliary_loss_mlp": 0.01047311, + "balance_loss_clip": 1.06686258, + "balance_loss_mlp": 1.03509188, + "epoch": 0.09775747008958095, + "flos": 22090916275200.0, + "grad_norm": 3.7191780324000128, + "language_loss": 0.85725152, + "learning_rate": 3.952108417716009e-06, + "loss": 0.87997854, + "num_input_tokens_seen": 17415135, + "step": 813, + "time_per_iteration": 2.5758559703826904 + }, + { + "auxiliary_loss_clip": 0.01238119, + "auxiliary_loss_mlp": 0.01037425, + "balance_loss_clip": 1.06992388, + "balance_loss_mlp": 1.02531981, + "epoch": 0.09787771298022005, + "flos": 21286050272640.0, + "grad_norm": 2.002374886920885, + "language_loss": 0.85238481, + "learning_rate": 3.951938821955727e-06, + "loss": 0.87514025, + "num_input_tokens_seen": 17434535, + "step": 814, + "time_per_iteration": 2.526736259460449 + }, + { + "auxiliary_loss_clip": 0.01220191, + "auxiliary_loss_mlp": 0.01054191, + "balance_loss_clip": 1.06682432, + "balance_loss_mlp": 1.04103017, + "epoch": 0.09799795587085913, + "flos": 22054574689920.0, + "grad_norm": 1.761950191157259, + "language_loss": 0.76614404, + "learning_rate": 3.9517689300878786e-06, + "loss": 0.78888786, + "num_input_tokens_seen": 17454270, + "step": 815, + "time_per_iteration": 2.563603162765503 + }, + { + "auxiliary_loss_clip": 0.0124907, + "auxiliary_loss_mlp": 0.0104743, + "balance_loss_clip": 1.06668615, + "balance_loss_mlp": 1.035074, + "epoch": 0.09811819876149823, + "flos": 22163743100160.0, + "grad_norm": 1.6154304050276231, + "language_loss": 0.78369677, + "learning_rate": 3.951598742138236e-06, + "loss": 0.80666184, + "num_input_tokens_seen": 17472995, + "step": 816, + "time_per_iteration": 2.5028696060180664 + }, + { + "auxiliary_loss_clip": 0.01222801, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_clip": 1.0601058, + "balance_loss_mlp": 1.03222632, + "epoch": 0.09823844165213731, + "flos": 22231111057920.0, + "grad_norm": 2.087037870464484, + "language_loss": 0.79446566, + "learning_rate": 3.951428258132615e-06, + "loss": 0.81713378, + "num_input_tokens_seen": 17491115, + "step": 817, + "time_per_iteration": 2.542043685913086 + }, + { + "auxiliary_loss_clip": 0.01221113, + "auxiliary_loss_mlp": 0.00766135, + "balance_loss_clip": 1.06701481, + "balance_loss_mlp": 1.000049, + "epoch": 0.09835868454277641, + "flos": 22487728798080.0, + "grad_norm": 1.8101882484421883, + "language_loss": 0.84654403, + "learning_rate": 3.951257478096879e-06, + "loss": 0.86641657, + "num_input_tokens_seen": 17509480, + "step": 818, + "time_per_iteration": 2.5610127449035645 + }, + { + "auxiliary_loss_clip": 0.01225898, + "auxiliary_loss_mlp": 0.00767, + "balance_loss_clip": 1.06874037, + "balance_loss_mlp": 1.00004125, + "epoch": 0.0984789274334155, + "flos": 16362554077440.0, + "grad_norm": 11.664055505286736, + "language_loss": 0.68325204, + "learning_rate": 3.951086402056936e-06, + "loss": 0.70318103, + "num_input_tokens_seen": 17524080, + "step": 819, + "time_per_iteration": 2.517002582550049 + }, + { + "auxiliary_loss_clip": 0.01151403, + "auxiliary_loss_mlp": 0.01040122, + "balance_loss_clip": 1.05917859, + "balance_loss_mlp": 1.02807021, + "epoch": 0.09859917032405459, + "flos": 24243545416320.0, + "grad_norm": 1.6781942512696926, + "language_loss": 0.83340466, + "learning_rate": 3.950915030038735e-06, + "loss": 0.85531998, + "num_input_tokens_seen": 17543875, + "step": 820, + "time_per_iteration": 2.673495292663574 + }, + { + "auxiliary_loss_clip": 0.01231708, + "auxiliary_loss_mlp": 0.0104036, + "balance_loss_clip": 1.06697381, + "balance_loss_mlp": 1.02834392, + "epoch": 0.09871941321469369, + "flos": 17420195064960.0, + "grad_norm": 2.1538299088470545, + "language_loss": 0.83764315, + "learning_rate": 3.9507433620682765e-06, + "loss": 0.86036384, + "num_input_tokens_seen": 17560810, + "step": 821, + "time_per_iteration": 3.302797794342041 + }, + { + "auxiliary_loss_clip": 0.01204123, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_clip": 1.06205118, + "balance_loss_mlp": 1.03183103, + "epoch": 0.09883965610533277, + "flos": 28477341590400.0, + "grad_norm": 1.8392728428405152, + "language_loss": 0.88368249, + "learning_rate": 3.9505713981716e-06, + "loss": 0.90616655, + "num_input_tokens_seen": 17583640, + "step": 822, + "time_per_iteration": 2.639491319656372 + }, + { + "auxiliary_loss_clip": 0.01217438, + "auxiliary_loss_mlp": 0.01040398, + "balance_loss_clip": 1.06719327, + "balance_loss_mlp": 1.02942514, + "epoch": 0.09895989899597187, + "flos": 23693932437120.0, + "grad_norm": 1.7954747837844185, + "language_loss": 0.8082515, + "learning_rate": 3.950399138374795e-06, + "loss": 0.83082992, + "num_input_tokens_seen": 17602720, + "step": 823, + "time_per_iteration": 2.5586977005004883 + }, + { + "auxiliary_loss_clip": 0.01235169, + "auxiliary_loss_mlp": 0.0104385, + "balance_loss_clip": 1.06695068, + "balance_loss_mlp": 1.03111267, + "epoch": 0.09908014188661095, + "flos": 24679608526080.0, + "grad_norm": 1.5718540344801786, + "language_loss": 0.74346745, + "learning_rate": 3.95022658270399e-06, + "loss": 0.76625764, + "num_input_tokens_seen": 17623085, + "step": 824, + "time_per_iteration": 2.5423552989959717 + }, + { + "auxiliary_loss_clip": 0.01218675, + "auxiliary_loss_mlp": 0.01043008, + "balance_loss_clip": 1.06728852, + "balance_loss_mlp": 1.03121233, + "epoch": 0.09920038477725004, + "flos": 14064307200000.0, + "grad_norm": 1.8990821428257043, + "language_loss": 0.78294086, + "learning_rate": 3.9500537311853635e-06, + "loss": 0.80555767, + "num_input_tokens_seen": 17641040, + "step": 825, + "time_per_iteration": 3.282687187194824 + }, + { + "auxiliary_loss_clip": 0.0123229, + "auxiliary_loss_mlp": 0.01039592, + "balance_loss_clip": 1.06240439, + "balance_loss_mlp": 1.02661586, + "epoch": 0.09932062766788914, + "flos": 13407070095360.0, + "grad_norm": 2.3387405094626996, + "language_loss": 0.83182621, + "learning_rate": 3.949880583845136e-06, + "loss": 0.854545, + "num_input_tokens_seen": 17659115, + "step": 826, + "time_per_iteration": 2.4942092895507812 + }, + { + "auxiliary_loss_clip": 0.01218665, + "auxiliary_loss_mlp": 0.01036333, + "balance_loss_clip": 1.06479549, + "balance_loss_mlp": 1.02437007, + "epoch": 0.09944087055852822, + "flos": 19500751566720.0, + "grad_norm": 1.7635335002744208, + "language_loss": 0.8168053, + "learning_rate": 3.949707140709575e-06, + "loss": 0.83935529, + "num_input_tokens_seen": 17678845, + "step": 827, + "time_per_iteration": 3.41426157951355 + }, + { + "auxiliary_loss_clip": 0.0123503, + "auxiliary_loss_mlp": 0.01041133, + "balance_loss_clip": 1.06363916, + "balance_loss_mlp": 1.02837777, + "epoch": 0.09956111344916732, + "flos": 17749100926080.0, + "grad_norm": 2.188741142924056, + "language_loss": 0.83308917, + "learning_rate": 3.949533401804991e-06, + "loss": 0.85585082, + "num_input_tokens_seen": 17695750, + "step": 828, + "time_per_iteration": 3.250134229660034 + }, + { + "auxiliary_loss_clip": 0.01233639, + "auxiliary_loss_mlp": 0.00766858, + "balance_loss_clip": 1.06704879, + "balance_loss_mlp": 0.99998927, + "epoch": 0.0996813563398064, + "flos": 17967581400960.0, + "grad_norm": 1.9560624122765464, + "language_loss": 0.90871429, + "learning_rate": 3.949359367157739e-06, + "loss": 0.92871928, + "num_input_tokens_seen": 17714445, + "step": 829, + "time_per_iteration": 2.4974539279937744 + }, + { + "auxiliary_loss_clip": 0.01239137, + "auxiliary_loss_mlp": 0.01043372, + "balance_loss_clip": 1.06791723, + "balance_loss_mlp": 1.03074765, + "epoch": 0.0998015992304455, + "flos": 17457039440640.0, + "grad_norm": 1.9942286437401784, + "language_loss": 0.75674176, + "learning_rate": 3.949185036794222e-06, + "loss": 0.77956688, + "num_input_tokens_seen": 17732455, + "step": 830, + "time_per_iteration": 2.5000433921813965 + }, + { + "auxiliary_loss_clip": 0.01249127, + "auxiliary_loss_mlp": 0.01044695, + "balance_loss_clip": 1.06883395, + "balance_loss_mlp": 1.03331614, + "epoch": 0.0999218421210846, + "flos": 25888757080320.0, + "grad_norm": 1.6033214206592308, + "language_loss": 0.78983754, + "learning_rate": 3.949010410740884e-06, + "loss": 0.81277573, + "num_input_tokens_seen": 17755280, + "step": 831, + "time_per_iteration": 2.525277853012085 + }, + { + "auxiliary_loss_clip": 0.01208779, + "auxiliary_loss_mlp": 0.0076628, + "balance_loss_clip": 1.06050897, + "balance_loss_mlp": 1.00001538, + "epoch": 0.10004208501172368, + "flos": 21215916967680.0, + "grad_norm": 1.62522100689321, + "language_loss": 0.86340111, + "learning_rate": 3.948835489024216e-06, + "loss": 0.88315165, + "num_input_tokens_seen": 17775015, + "step": 832, + "time_per_iteration": 2.541314125061035 + }, + { + "auxiliary_loss_clip": 0.01236704, + "auxiliary_loss_mlp": 0.01045288, + "balance_loss_clip": 1.06627107, + "balance_loss_mlp": 1.03300381, + "epoch": 0.10016232790236278, + "flos": 17348409734400.0, + "grad_norm": 3.1618027524712726, + "language_loss": 0.90499282, + "learning_rate": 3.948660271670755e-06, + "loss": 0.9278127, + "num_input_tokens_seen": 17792165, + "step": 833, + "time_per_iteration": 2.4987680912017822 + }, + { + "auxiliary_loss_clip": 0.01215297, + "auxiliary_loss_mlp": 0.01044734, + "balance_loss_clip": 1.06385088, + "balance_loss_mlp": 1.03330183, + "epoch": 0.10028257079300186, + "flos": 25666541591040.0, + "grad_norm": 2.5402864867469956, + "language_loss": 0.84196162, + "learning_rate": 3.948484758707079e-06, + "loss": 0.86456198, + "num_input_tokens_seen": 17811765, + "step": 834, + "time_per_iteration": 2.575428009033203 + }, + { + "auxiliary_loss_clip": 0.01193654, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_clip": 1.05822778, + "balance_loss_mlp": 1.02848887, + "epoch": 0.10040281368364096, + "flos": 25156035544320.0, + "grad_norm": 2.205290033562829, + "language_loss": 0.83342886, + "learning_rate": 3.948308950159815e-06, + "loss": 0.85578179, + "num_input_tokens_seen": 17830445, + "step": 835, + "time_per_iteration": 2.605041980743408 + }, + { + "auxiliary_loss_clip": 0.01198919, + "auxiliary_loss_mlp": 0.01047501, + "balance_loss_clip": 1.05968094, + "balance_loss_mlp": 1.03349376, + "epoch": 0.10052305657428004, + "flos": 17603303621760.0, + "grad_norm": 3.343470919223055, + "language_loss": 0.75939095, + "learning_rate": 3.9481328460556326e-06, + "loss": 0.78185511, + "num_input_tokens_seen": 17847665, + "step": 836, + "time_per_iteration": 2.577462911605835 + }, + { + "auxiliary_loss_clip": 0.01209342, + "auxiliary_loss_mlp": 0.01038248, + "balance_loss_clip": 1.06079316, + "balance_loss_mlp": 1.0261606, + "epoch": 0.10064329946491914, + "flos": 18660154510080.0, + "grad_norm": 2.119490816381886, + "language_loss": 0.89630353, + "learning_rate": 3.9479564464212455e-06, + "loss": 0.91877943, + "num_input_tokens_seen": 17866825, + "step": 837, + "time_per_iteration": 2.529484987258911 + }, + { + "auxiliary_loss_clip": 0.01255556, + "auxiliary_loss_mlp": 0.01040527, + "balance_loss_clip": 1.06921005, + "balance_loss_mlp": 1.02846348, + "epoch": 0.10076354235555823, + "flos": 17199056983680.0, + "grad_norm": 2.4916729210920843, + "language_loss": 0.76543212, + "learning_rate": 3.947779751283414e-06, + "loss": 0.7883929, + "num_input_tokens_seen": 17883995, + "step": 838, + "time_per_iteration": 2.4439988136291504 + }, + { + "auxiliary_loss_clip": 0.01237269, + "auxiliary_loss_mlp": 0.00767009, + "balance_loss_clip": 1.07233191, + "balance_loss_mlp": 0.99994469, + "epoch": 0.10088378524619732, + "flos": 22962252395520.0, + "grad_norm": 1.7667242844680333, + "language_loss": 0.76098895, + "learning_rate": 3.947602760668944e-06, + "loss": 0.78103173, + "num_input_tokens_seen": 17903785, + "step": 839, + "time_per_iteration": 2.494786262512207 + }, + { + "auxiliary_loss_clip": 0.012346, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.06970143, + "balance_loss_mlp": 1.03175759, + "epoch": 0.10100402813683641, + "flos": 37885828746240.0, + "grad_norm": 2.005843276947239, + "language_loss": 0.71448809, + "learning_rate": 3.947425474604684e-06, + "loss": 0.73727626, + "num_input_tokens_seen": 17927720, + "step": 840, + "time_per_iteration": 2.651487112045288 + }, + { + "auxiliary_loss_clip": 0.01217405, + "auxiliary_loss_mlp": 0.01048681, + "balance_loss_clip": 1.06426644, + "balance_loss_mlp": 1.03670037, + "epoch": 0.1011242710274755, + "flos": 21543458112000.0, + "grad_norm": 2.5419988743685646, + "language_loss": 0.92191446, + "learning_rate": 3.947247893117528e-06, + "loss": 0.94457537, + "num_input_tokens_seen": 17946225, + "step": 841, + "time_per_iteration": 2.5118865966796875 + }, + { + "auxiliary_loss_clip": 0.01230428, + "auxiliary_loss_mlp": 0.01046582, + "balance_loss_clip": 1.06437933, + "balance_loss_mlp": 1.03419077, + "epoch": 0.10124451391811459, + "flos": 13621456419840.0, + "grad_norm": 3.355128122581476, + "language_loss": 0.69190425, + "learning_rate": 3.947070016234413e-06, + "loss": 0.71467435, + "num_input_tokens_seen": 17962015, + "step": 842, + "time_per_iteration": 2.452122688293457 + }, + { + "auxiliary_loss_clip": 0.01229044, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.06816494, + "balance_loss_mlp": 1.02964902, + "epoch": 0.10136475680875369, + "flos": 16649228522880.0, + "grad_norm": 2.8394286288835353, + "language_loss": 0.74950635, + "learning_rate": 3.946891843982326e-06, + "loss": 0.77222323, + "num_input_tokens_seen": 17979680, + "step": 843, + "time_per_iteration": 2.517972707748413 + }, + { + "auxiliary_loss_clip": 0.0123458, + "auxiliary_loss_mlp": 0.01040493, + "balance_loss_clip": 1.06779242, + "balance_loss_mlp": 1.02793479, + "epoch": 0.10148499969939277, + "flos": 19461034103040.0, + "grad_norm": 2.3815770918193384, + "language_loss": 0.74619412, + "learning_rate": 3.9467133763882935e-06, + "loss": 0.76894486, + "num_input_tokens_seen": 17998145, + "step": 844, + "time_per_iteration": 2.463911533355713 + }, + { + "auxiliary_loss_clip": 0.01223991, + "auxiliary_loss_mlp": 0.01045653, + "balance_loss_clip": 1.06532001, + "balance_loss_mlp": 1.03283191, + "epoch": 0.10160524259003187, + "flos": 21104988791040.0, + "grad_norm": 2.137850562545883, + "language_loss": 0.8633101, + "learning_rate": 3.9465346134793905e-06, + "loss": 0.88600647, + "num_input_tokens_seen": 18017955, + "step": 845, + "time_per_iteration": 2.5580990314483643 + }, + { + "auxiliary_loss_clip": 0.01206699, + "auxiliary_loss_mlp": 0.0104082, + "balance_loss_clip": 1.0662179, + "balance_loss_mlp": 1.02924466, + "epoch": 0.10172548548067095, + "flos": 17712687513600.0, + "grad_norm": 1.9391952462209916, + "language_loss": 0.79561269, + "learning_rate": 3.9463555552827335e-06, + "loss": 0.81808794, + "num_input_tokens_seen": 18035125, + "step": 846, + "time_per_iteration": 2.539644718170166 + }, + { + "auxiliary_loss_clip": 0.012232, + "auxiliary_loss_mlp": 0.01048094, + "balance_loss_clip": 1.06467104, + "balance_loss_mlp": 1.03598249, + "epoch": 0.10184572837131005, + "flos": 21104845136640.0, + "grad_norm": 2.3799335297335693, + "language_loss": 0.86475122, + "learning_rate": 3.946176201825487e-06, + "loss": 0.88746417, + "num_input_tokens_seen": 18053160, + "step": 847, + "time_per_iteration": 3.2794134616851807 + }, + { + "auxiliary_loss_clip": 0.01220933, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.06788039, + "balance_loss_mlp": 1.03592002, + "epoch": 0.10196597126194913, + "flos": 26067591918720.0, + "grad_norm": 1.8757728826935736, + "language_loss": 0.83539331, + "learning_rate": 3.9459965531348575e-06, + "loss": 0.85808378, + "num_input_tokens_seen": 18072815, + "step": 848, + "time_per_iteration": 2.5506062507629395 + }, + { + "auxiliary_loss_clip": 0.01220936, + "auxiliary_loss_mlp": 0.00766942, + "balance_loss_clip": 1.06723249, + "balance_loss_mlp": 1.00012016, + "epoch": 0.10208621415258823, + "flos": 29314634595840.0, + "grad_norm": 2.1663592208987756, + "language_loss": 0.85718745, + "learning_rate": 3.945816609238098e-06, + "loss": 0.87706625, + "num_input_tokens_seen": 18092225, + "step": 849, + "time_per_iteration": 2.592404842376709 + }, + { + "auxiliary_loss_clip": 0.01180991, + "auxiliary_loss_mlp": 0.0104352, + "balance_loss_clip": 1.06252837, + "balance_loss_mlp": 1.0308001, + "epoch": 0.10220645704322733, + "flos": 23805794367360.0, + "grad_norm": 2.5953664706914634, + "language_loss": 0.85040855, + "learning_rate": 3.945636370162507e-06, + "loss": 0.8726536, + "num_input_tokens_seen": 18112335, + "step": 850, + "time_per_iteration": 2.6394407749176025 + }, + { + "auxiliary_loss_clip": 0.01231576, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.0672524, + "balance_loss_mlp": 1.03615785, + "epoch": 0.10232669993386641, + "flos": 23218546913280.0, + "grad_norm": 2.0973420090993704, + "language_loss": 0.79131997, + "learning_rate": 3.945455835935425e-06, + "loss": 0.81411076, + "num_input_tokens_seen": 18131520, + "step": 851, + "time_per_iteration": 2.672485589981079 + }, + { + "auxiliary_loss_clip": 0.01220252, + "auxiliary_loss_mlp": 0.01046661, + "balance_loss_clip": 1.06529629, + "balance_loss_mlp": 1.03484726, + "epoch": 0.1024469428245055, + "flos": 22922929981440.0, + "grad_norm": 2.166325375407431, + "language_loss": 0.75056547, + "learning_rate": 3.94527500658424e-06, + "loss": 0.77323461, + "num_input_tokens_seen": 18149185, + "step": 852, + "time_per_iteration": 3.268260955810547 + }, + { + "auxiliary_loss_clip": 0.01188679, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.06291997, + "balance_loss_mlp": 1.02733111, + "epoch": 0.10256718571514459, + "flos": 31359495957120.0, + "grad_norm": 1.9073348344713514, + "language_loss": 0.8100422, + "learning_rate": 3.945093882136382e-06, + "loss": 0.83231461, + "num_input_tokens_seen": 18172960, + "step": 853, + "time_per_iteration": 3.671428918838501 + }, + { + "auxiliary_loss_clip": 0.01216055, + "auxiliary_loss_mlp": 0.00765419, + "balance_loss_clip": 1.06689453, + "balance_loss_mlp": 1.00006795, + "epoch": 0.10268742860578368, + "flos": 23474877344640.0, + "grad_norm": 2.0369046488082856, + "language_loss": 0.84509921, + "learning_rate": 3.944912462619329e-06, + "loss": 0.86491388, + "num_input_tokens_seen": 18191925, + "step": 854, + "time_per_iteration": 2.6228551864624023 + }, + { + "auxiliary_loss_clip": 0.01221666, + "auxiliary_loss_mlp": 0.01049189, + "balance_loss_clip": 1.06585908, + "balance_loss_mlp": 1.03633189, + "epoch": 0.10280767149642277, + "flos": 25520313323520.0, + "grad_norm": 3.404673092178469, + "language_loss": 0.80233961, + "learning_rate": 3.9447307480606025e-06, + "loss": 0.82504815, + "num_input_tokens_seen": 18212010, + "step": 855, + "time_per_iteration": 3.3405556678771973 + }, + { + "auxiliary_loss_clip": 0.01214746, + "auxiliary_loss_mlp": 0.01043223, + "balance_loss_clip": 1.06535614, + "balance_loss_mlp": 1.03015125, + "epoch": 0.10292791438706186, + "flos": 17347691462400.0, + "grad_norm": 3.003890064888439, + "language_loss": 0.89972758, + "learning_rate": 3.944548738487767e-06, + "loss": 0.92230725, + "num_input_tokens_seen": 18229525, + "step": 856, + "time_per_iteration": 2.5274417400360107 + }, + { + "auxiliary_loss_clip": 0.01257635, + "auxiliary_loss_mlp": 0.01044011, + "balance_loss_clip": 1.07388747, + "balance_loss_mlp": 1.03259122, + "epoch": 0.10304815727770096, + "flos": 27052693390080.0, + "grad_norm": 2.100832403580187, + "language_loss": 0.90825057, + "learning_rate": 3.944366433928434e-06, + "loss": 0.93126702, + "num_input_tokens_seen": 18249505, + "step": 857, + "time_per_iteration": 2.5308327674865723 + }, + { + "auxiliary_loss_clip": 0.01213004, + "auxiliary_loss_mlp": 0.01047817, + "balance_loss_clip": 1.06232917, + "balance_loss_mlp": 1.03597975, + "epoch": 0.10316840016834004, + "flos": 22782591544320.0, + "grad_norm": 2.237102036843472, + "language_loss": 0.83802682, + "learning_rate": 3.9441838344102594e-06, + "loss": 0.86063504, + "num_input_tokens_seen": 18269230, + "step": 858, + "time_per_iteration": 2.627354860305786 + }, + { + "auxiliary_loss_clip": 0.01226715, + "auxiliary_loss_mlp": 0.01042823, + "balance_loss_clip": 1.06885815, + "balance_loss_mlp": 1.03116405, + "epoch": 0.10328864305897914, + "flos": 20704584908160.0, + "grad_norm": 3.303488850851589, + "language_loss": 0.67042291, + "learning_rate": 3.944000939960943e-06, + "loss": 0.69311827, + "num_input_tokens_seen": 18287955, + "step": 859, + "time_per_iteration": 2.5827934741973877 + }, + { + "auxiliary_loss_clip": 0.01237978, + "auxiliary_loss_mlp": 0.01039218, + "balance_loss_clip": 1.06616306, + "balance_loss_mlp": 1.02867961, + "epoch": 0.10340888594961822, + "flos": 28478814048000.0, + "grad_norm": 6.370256847344052, + "language_loss": 0.80057919, + "learning_rate": 3.943817750608229e-06, + "loss": 0.8233512, + "num_input_tokens_seen": 18310505, + "step": 860, + "time_per_iteration": 2.584465265274048 + }, + { + "auxiliary_loss_clip": 0.01241111, + "auxiliary_loss_mlp": 0.0104005, + "balance_loss_clip": 1.07246232, + "balance_loss_mlp": 1.0286541, + "epoch": 0.10352912884025732, + "flos": 13370333460480.0, + "grad_norm": 2.175762921988747, + "language_loss": 0.81847197, + "learning_rate": 3.943634266379908e-06, + "loss": 0.84128356, + "num_input_tokens_seen": 18327400, + "step": 861, + "time_per_iteration": 2.47428822517395 + }, + { + "auxiliary_loss_clip": 0.01237188, + "auxiliary_loss_mlp": 0.01040193, + "balance_loss_clip": 1.06687546, + "balance_loss_mlp": 1.02873671, + "epoch": 0.10364937173089642, + "flos": 25558558329600.0, + "grad_norm": 1.8234020500404189, + "language_loss": 0.84948528, + "learning_rate": 3.943450487303815e-06, + "loss": 0.87225908, + "num_input_tokens_seen": 18347895, + "step": 862, + "time_per_iteration": 2.5316033363342285 + }, + { + "auxiliary_loss_clip": 0.01232499, + "auxiliary_loss_mlp": 0.01039847, + "balance_loss_clip": 1.06812179, + "balance_loss_mlp": 1.02818191, + "epoch": 0.1037696146215355, + "flos": 21215486004480.0, + "grad_norm": 1.7833454724004991, + "language_loss": 0.85031354, + "learning_rate": 3.943266413407827e-06, + "loss": 0.87303698, + "num_input_tokens_seen": 18367170, + "step": 863, + "time_per_iteration": 2.4865143299102783 + }, + { + "auxiliary_loss_clip": 0.01237765, + "auxiliary_loss_mlp": 0.01043358, + "balance_loss_clip": 1.07003617, + "balance_loss_mlp": 1.03169918, + "epoch": 0.1038898575121746, + "flos": 25807382818560.0, + "grad_norm": 1.8825425416363033, + "language_loss": 0.84739596, + "learning_rate": 3.94308204471987e-06, + "loss": 0.87020719, + "num_input_tokens_seen": 18386185, + "step": 864, + "time_per_iteration": 2.539384365081787 + }, + { + "auxiliary_loss_clip": 0.01205002, + "auxiliary_loss_mlp": 0.01035991, + "balance_loss_clip": 1.06376529, + "balance_loss_mlp": 1.02414763, + "epoch": 0.10401010040281368, + "flos": 19062425900160.0, + "grad_norm": 3.319257796739146, + "language_loss": 0.7492885, + "learning_rate": 3.942897381267912e-06, + "loss": 0.77169847, + "num_input_tokens_seen": 18402550, + "step": 865, + "time_per_iteration": 2.544003486633301 + }, + { + "auxiliary_loss_clip": 0.01243042, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.07225144, + "balance_loss_mlp": 1.02283382, + "epoch": 0.10413034329345278, + "flos": 16355119962240.0, + "grad_norm": 2.7712562649215355, + "language_loss": 0.65953261, + "learning_rate": 3.942712423079965e-06, + "loss": 0.68230605, + "num_input_tokens_seen": 18418940, + "step": 866, + "time_per_iteration": 2.4790189266204834 + }, + { + "auxiliary_loss_clip": 0.01184209, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.0545032, + "balance_loss_mlp": 1.02528405, + "epoch": 0.10425058618409186, + "flos": 17236511890560.0, + "grad_norm": 2.526350744957532, + "language_loss": 0.90156156, + "learning_rate": 3.942527170184088e-06, + "loss": 0.92375827, + "num_input_tokens_seen": 18435560, + "step": 867, + "time_per_iteration": 2.5399932861328125 + }, + { + "auxiliary_loss_clip": 0.01253517, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.07183504, + "balance_loss_mlp": 1.0294168, + "epoch": 0.10437082907473096, + "flos": 17967365919360.0, + "grad_norm": 2.264853869802826, + "language_loss": 0.77554595, + "learning_rate": 3.942341622608385e-06, + "loss": 0.79849333, + "num_input_tokens_seen": 18452590, + "step": 868, + "time_per_iteration": 2.451336145401001 + }, + { + "auxiliary_loss_clip": 0.01222189, + "auxiliary_loss_mlp": 0.01043724, + "balance_loss_clip": 1.0691812, + "balance_loss_mlp": 1.03211904, + "epoch": 0.10449107196537005, + "flos": 36283315374720.0, + "grad_norm": 2.001293168578348, + "language_loss": 0.78016436, + "learning_rate": 3.942155780381001e-06, + "loss": 0.80282348, + "num_input_tokens_seen": 18476325, + "step": 869, + "time_per_iteration": 2.6595215797424316 + }, + { + "auxiliary_loss_clip": 0.01220711, + "auxiliary_loss_mlp": 0.0104597, + "balance_loss_clip": 1.06374049, + "balance_loss_mlp": 1.03341126, + "epoch": 0.10461131485600914, + "flos": 23802095266560.0, + "grad_norm": 1.9106085158732284, + "language_loss": 0.75781316, + "learning_rate": 3.94196964353013e-06, + "loss": 0.78047997, + "num_input_tokens_seen": 18495775, + "step": 870, + "time_per_iteration": 2.5559327602386475 + }, + { + "auxiliary_loss_clip": 0.01216956, + "auxiliary_loss_mlp": 0.00765485, + "balance_loss_clip": 1.06374252, + "balance_loss_mlp": 1.00012648, + "epoch": 0.10473155774664823, + "flos": 18405476104320.0, + "grad_norm": 1.9737760200382524, + "language_loss": 0.80670154, + "learning_rate": 3.941783212084008e-06, + "loss": 0.82652593, + "num_input_tokens_seen": 18513530, + "step": 871, + "time_per_iteration": 2.5382020473480225 + }, + { + "auxiliary_loss_clip": 0.01205355, + "auxiliary_loss_mlp": 0.01044931, + "balance_loss_clip": 1.06511593, + "balance_loss_mlp": 1.03271246, + "epoch": 0.10485180063728732, + "flos": 25592637358080.0, + "grad_norm": 2.4557297627007997, + "language_loss": 0.79317546, + "learning_rate": 3.941596486070916e-06, + "loss": 0.81567836, + "num_input_tokens_seen": 18531575, + "step": 872, + "time_per_iteration": 2.588190793991089 + }, + { + "auxiliary_loss_clip": 0.01186879, + "auxiliary_loss_mlp": 0.01036622, + "balance_loss_clip": 1.06414151, + "balance_loss_mlp": 1.02384257, + "epoch": 0.10497204352792641, + "flos": 27088747666560.0, + "grad_norm": 2.1412634343711154, + "language_loss": 0.58533347, + "learning_rate": 3.941409465519182e-06, + "loss": 0.60756838, + "num_input_tokens_seen": 18552100, + "step": 873, + "time_per_iteration": 2.6472792625427246 + }, + { + "auxiliary_loss_clip": 0.0122552, + "auxiliary_loss_mlp": 0.01044806, + "balance_loss_clip": 1.06407785, + "balance_loss_mlp": 1.03165078, + "epoch": 0.10509228641856551, + "flos": 32858479353600.0, + "grad_norm": 1.6447512575985188, + "language_loss": 0.85198975, + "learning_rate": 3.941222150457176e-06, + "loss": 0.87469304, + "num_input_tokens_seen": 18575355, + "step": 874, + "time_per_iteration": 3.423719882965088 + }, + { + "auxiliary_loss_clip": 0.01240177, + "auxiliary_loss_mlp": 0.01038778, + "balance_loss_clip": 1.0676465, + "balance_loss_mlp": 1.02667212, + "epoch": 0.10521252930920459, + "flos": 14319165173760.0, + "grad_norm": 2.8463727202923286, + "language_loss": 0.71149546, + "learning_rate": 3.941034540913311e-06, + "loss": 0.734285, + "num_input_tokens_seen": 18592885, + "step": 875, + "time_per_iteration": 2.475145101547241 + }, + { + "auxiliary_loss_clip": 0.01236973, + "auxiliary_loss_mlp": 0.00766624, + "balance_loss_clip": 1.07008505, + "balance_loss_mlp": 1.00016022, + "epoch": 0.10533277219984369, + "flos": 21687028773120.0, + "grad_norm": 1.5962355853127843, + "language_loss": 0.82411563, + "learning_rate": 3.940846636916051e-06, + "loss": 0.84415162, + "num_input_tokens_seen": 18612920, + "step": 876, + "time_per_iteration": 2.5428812503814697 + }, + { + "auxiliary_loss_clip": 0.01220273, + "auxiliary_loss_mlp": 0.01048191, + "balance_loss_clip": 1.07053804, + "balance_loss_mlp": 1.03508949, + "epoch": 0.10545301509048277, + "flos": 22269787027200.0, + "grad_norm": 2.0829580387345055, + "language_loss": 0.86610603, + "learning_rate": 3.940658438493899e-06, + "loss": 0.88879067, + "num_input_tokens_seen": 18630765, + "step": 877, + "time_per_iteration": 2.545813798904419 + }, + { + "auxiliary_loss_clip": 0.01254124, + "auxiliary_loss_mlp": 0.01042168, + "balance_loss_clip": 1.06617868, + "balance_loss_mlp": 1.0295136, + "epoch": 0.10557325798112187, + "flos": 22199725549440.0, + "grad_norm": 2.1651349502749775, + "language_loss": 0.76163447, + "learning_rate": 3.940469945675405e-06, + "loss": 0.7845974, + "num_input_tokens_seen": 18649150, + "step": 878, + "time_per_iteration": 3.246293067932129 + }, + { + "auxiliary_loss_clip": 0.01166553, + "auxiliary_loss_mlp": 0.01045867, + "balance_loss_clip": 1.05816996, + "balance_loss_mlp": 1.03444648, + "epoch": 0.10569350087176095, + "flos": 25775889569280.0, + "grad_norm": 2.023790246003973, + "language_loss": 0.91725224, + "learning_rate": 3.940281158489163e-06, + "loss": 0.93937647, + "num_input_tokens_seen": 18668380, + "step": 879, + "time_per_iteration": 2.630768060684204 + }, + { + "auxiliary_loss_clip": 0.01168387, + "auxiliary_loss_mlp": 0.01044457, + "balance_loss_clip": 1.0555619, + "balance_loss_mlp": 1.03275084, + "epoch": 0.10581374376240005, + "flos": 17311385790720.0, + "grad_norm": 7.253074076075061, + "language_loss": 0.82890636, + "learning_rate": 3.940092076963812e-06, + "loss": 0.85103476, + "num_input_tokens_seen": 18685875, + "step": 880, + "time_per_iteration": 3.491748809814453 + }, + { + "auxiliary_loss_clip": 0.01216041, + "auxiliary_loss_mlp": 0.01046981, + "balance_loss_clip": 1.06327569, + "balance_loss_mlp": 1.03451133, + "epoch": 0.10593398665303914, + "flos": 34349454017280.0, + "grad_norm": 2.201171697155286, + "language_loss": 0.78926682, + "learning_rate": 3.9399027011280355e-06, + "loss": 0.81189704, + "num_input_tokens_seen": 18707970, + "step": 881, + "time_per_iteration": 2.6801040172576904 + }, + { + "auxiliary_loss_clip": 0.0121879, + "auxiliary_loss_mlp": 0.01040829, + "balance_loss_clip": 1.06817174, + "balance_loss_mlp": 1.02867508, + "epoch": 0.10605422954367823, + "flos": 23257977068160.0, + "grad_norm": 2.1336259516275486, + "language_loss": 0.77490532, + "learning_rate": 3.939713031010561e-06, + "loss": 0.7975015, + "num_input_tokens_seen": 18726335, + "step": 882, + "time_per_iteration": 3.284637451171875 + }, + { + "auxiliary_loss_clip": 0.0120143, + "auxiliary_loss_mlp": 0.01042383, + "balance_loss_clip": 1.06556129, + "balance_loss_mlp": 1.02910328, + "epoch": 0.10617447243431732, + "flos": 22820118278400.0, + "grad_norm": 2.3591702407021558, + "language_loss": 0.77737093, + "learning_rate": 3.939523066640163e-06, + "loss": 0.7998091, + "num_input_tokens_seen": 18745230, + "step": 883, + "time_per_iteration": 2.578439474105835 + }, + { + "auxiliary_loss_clip": 0.01237568, + "auxiliary_loss_mlp": 0.01041504, + "balance_loss_clip": 1.06838441, + "balance_loss_mlp": 1.02965486, + "epoch": 0.10629471532495641, + "flos": 24386577373440.0, + "grad_norm": 1.8746076137111203, + "language_loss": 0.81338352, + "learning_rate": 3.939332808045657e-06, + "loss": 0.83617419, + "num_input_tokens_seen": 18764880, + "step": 884, + "time_per_iteration": 2.5560302734375 + }, + { + "auxiliary_loss_clip": 0.0120491, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.0650779, + "balance_loss_mlp": 1.03197157, + "epoch": 0.1064149582155955, + "flos": 21105491581440.0, + "grad_norm": 1.6841616910548782, + "language_loss": 0.84690976, + "learning_rate": 3.939142255255906e-06, + "loss": 0.86939502, + "num_input_tokens_seen": 18785765, + "step": 885, + "time_per_iteration": 2.6163365840911865 + }, + { + "auxiliary_loss_clip": 0.01236461, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.06873155, + "balance_loss_mlp": 1.02271545, + "epoch": 0.1065352011062346, + "flos": 20702035042560.0, + "grad_norm": 2.3705089487107935, + "language_loss": 0.86455911, + "learning_rate": 3.938951408299817e-06, + "loss": 0.88727641, + "num_input_tokens_seen": 18804605, + "step": 886, + "time_per_iteration": 2.5161798000335693 + }, + { + "auxiliary_loss_clip": 0.01107957, + "auxiliary_loss_mlp": 0.01022806, + "balance_loss_clip": 1.05832243, + "balance_loss_mlp": 1.01918221, + "epoch": 0.10665544399687368, + "flos": 62659632689280.0, + "grad_norm": 0.8062078229451766, + "language_loss": 0.5441612, + "learning_rate": 3.938760267206342e-06, + "loss": 0.56546885, + "num_input_tokens_seen": 18866425, + "step": 887, + "time_per_iteration": 3.104126214981079 + }, + { + "auxiliary_loss_clip": 0.01250889, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.0702492, + "balance_loss_mlp": 1.02737117, + "epoch": 0.10677568688751278, + "flos": 26140382830080.0, + "grad_norm": 2.2213090147609504, + "language_loss": 0.79058969, + "learning_rate": 3.938568832004475e-06, + "loss": 0.81349224, + "num_input_tokens_seen": 18885130, + "step": 888, + "time_per_iteration": 2.5349457263946533 + }, + { + "auxiliary_loss_clip": 0.0121022, + "auxiliary_loss_mlp": 0.01051649, + "balance_loss_clip": 1.06313348, + "balance_loss_mlp": 1.03900731, + "epoch": 0.10689592977815186, + "flos": 12786533712000.0, + "grad_norm": 2.250505992968594, + "language_loss": 0.75351048, + "learning_rate": 3.938377102723257e-06, + "loss": 0.77612913, + "num_input_tokens_seen": 18902265, + "step": 889, + "time_per_iteration": 2.5150461196899414 + }, + { + "auxiliary_loss_clip": 0.01171473, + "auxiliary_loss_mlp": 0.01051904, + "balance_loss_clip": 1.05809522, + "balance_loss_mlp": 1.03889275, + "epoch": 0.10701617266879096, + "flos": 22126683242880.0, + "grad_norm": 2.0272943104641965, + "language_loss": 0.8326872, + "learning_rate": 3.938185079391774e-06, + "loss": 0.85492098, + "num_input_tokens_seen": 18919310, + "step": 890, + "time_per_iteration": 2.6090900897979736 + }, + { + "auxiliary_loss_clip": 0.01250107, + "auxiliary_loss_mlp": 0.01035693, + "balance_loss_clip": 1.06854534, + "balance_loss_mlp": 1.02387333, + "epoch": 0.10713641555943004, + "flos": 19745625559680.0, + "grad_norm": 2.4502619689593828, + "language_loss": 1.05944252, + "learning_rate": 3.937992762039157e-06, + "loss": 1.08230042, + "num_input_tokens_seen": 18932635, + "step": 891, + "time_per_iteration": 2.4560630321502686 + }, + { + "auxiliary_loss_clip": 0.01232828, + "auxiliary_loss_mlp": 0.01049007, + "balance_loss_clip": 1.06847763, + "balance_loss_mlp": 1.03724122, + "epoch": 0.10725665845006914, + "flos": 23952992302080.0, + "grad_norm": 1.5648784372640385, + "language_loss": 0.8026191, + "learning_rate": 3.937800150694577e-06, + "loss": 0.82543743, + "num_input_tokens_seen": 18953810, + "step": 892, + "time_per_iteration": 2.628404378890991 + }, + { + "auxiliary_loss_clip": 0.01188786, + "auxiliary_loss_mlp": 0.01041942, + "balance_loss_clip": 1.06502533, + "balance_loss_mlp": 1.02937186, + "epoch": 0.10737690134070824, + "flos": 18551704371840.0, + "grad_norm": 2.125893412374179, + "language_loss": 0.75950187, + "learning_rate": 3.937607245387255e-06, + "loss": 0.78180915, + "num_input_tokens_seen": 18973175, + "step": 893, + "time_per_iteration": 2.6575801372528076 + }, + { + "auxiliary_loss_clip": 0.01226172, + "auxiliary_loss_mlp": 0.01046969, + "balance_loss_clip": 1.06583548, + "balance_loss_mlp": 1.03556085, + "epoch": 0.10749714423134732, + "flos": 22707609903360.0, + "grad_norm": 1.8954759059579405, + "language_loss": 0.72281992, + "learning_rate": 3.937414046146455e-06, + "loss": 0.74555135, + "num_input_tokens_seen": 18991130, + "step": 894, + "time_per_iteration": 2.5614848136901855 + }, + { + "auxiliary_loss_clip": 0.01252668, + "auxiliary_loss_mlp": 0.01051168, + "balance_loss_clip": 1.07163644, + "balance_loss_mlp": 1.03790641, + "epoch": 0.10761738712198642, + "flos": 21106066199040.0, + "grad_norm": 2.4041544553142336, + "language_loss": 0.75544989, + "learning_rate": 3.9372205530014845e-06, + "loss": 0.77848828, + "num_input_tokens_seen": 19009610, + "step": 895, + "time_per_iteration": 2.501729965209961 + }, + { + "auxiliary_loss_clip": 0.0124967, + "auxiliary_loss_mlp": 0.01057419, + "balance_loss_clip": 1.06933403, + "balance_loss_mlp": 1.04577255, + "epoch": 0.1077376300126255, + "flos": 23766723348480.0, + "grad_norm": 3.3340744936338242, + "language_loss": 0.7168293, + "learning_rate": 3.937026765981696e-06, + "loss": 0.73990023, + "num_input_tokens_seen": 19029680, + "step": 896, + "time_per_iteration": 2.5032219886779785 + }, + { + "auxiliary_loss_clip": 0.01209457, + "auxiliary_loss_mlp": 0.01048998, + "balance_loss_clip": 1.06693029, + "balance_loss_mlp": 1.03682637, + "epoch": 0.1078578729032646, + "flos": 20919581763840.0, + "grad_norm": 1.944272017540537, + "language_loss": 0.79521108, + "learning_rate": 3.936832685116488e-06, + "loss": 0.81779563, + "num_input_tokens_seen": 19047775, + "step": 897, + "time_per_iteration": 2.5997555255889893 + }, + { + "auxiliary_loss_clip": 0.01250602, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.0701344, + "balance_loss_mlp": 1.03820968, + "epoch": 0.10797811579390369, + "flos": 14829886702080.0, + "grad_norm": 2.2601872502838085, + "language_loss": 0.90124583, + "learning_rate": 3.936638310435301e-06, + "loss": 0.92425281, + "num_input_tokens_seen": 19065640, + "step": 898, + "time_per_iteration": 2.476712226867676 + }, + { + "auxiliary_loss_clip": 0.01238517, + "auxiliary_loss_mlp": 0.01041953, + "balance_loss_clip": 1.06957984, + "balance_loss_mlp": 1.02962112, + "epoch": 0.10809835868454278, + "flos": 19536985411200.0, + "grad_norm": 1.8516766769738098, + "language_loss": 0.81500447, + "learning_rate": 3.936443641967623e-06, + "loss": 0.83780915, + "num_input_tokens_seen": 19084470, + "step": 899, + "time_per_iteration": 2.5221753120422363 + }, + { + "auxiliary_loss_clip": 0.01220105, + "auxiliary_loss_mlp": 0.01046999, + "balance_loss_clip": 1.06801021, + "balance_loss_mlp": 1.03463709, + "epoch": 0.10821860157518187, + "flos": 18442320480000.0, + "grad_norm": 3.7749681671373114, + "language_loss": 0.83078492, + "learning_rate": 3.936248679742983e-06, + "loss": 0.8534559, + "num_input_tokens_seen": 19102965, + "step": 900, + "time_per_iteration": 3.4695215225219727 + }, + { + "auxiliary_loss_clip": 0.01095446, + "auxiliary_loss_mlp": 0.01046967, + "balance_loss_clip": 1.03037715, + "balance_loss_mlp": 1.0428896, + "epoch": 0.10833884446582095, + "flos": 49359468447360.0, + "grad_norm": 1.059869429396583, + "language_loss": 0.70163828, + "learning_rate": 3.936053423790959e-06, + "loss": 0.7230624, + "num_input_tokens_seen": 19151285, + "step": 901, + "time_per_iteration": 2.9060444831848145 + }, + { + "auxiliary_loss_clip": 0.01250976, + "auxiliary_loss_mlp": 0.01053356, + "balance_loss_clip": 1.07237124, + "balance_loss_mlp": 1.04193604, + "epoch": 0.10845908735646005, + "flos": 20411912891520.0, + "grad_norm": 1.9478997185895697, + "language_loss": 0.77381372, + "learning_rate": 3.935857874141168e-06, + "loss": 0.79685712, + "num_input_tokens_seen": 19170120, + "step": 902, + "time_per_iteration": 2.510826587677002 + }, + { + "auxiliary_loss_clip": 0.01211105, + "auxiliary_loss_mlp": 0.01037111, + "balance_loss_clip": 1.06463456, + "balance_loss_mlp": 1.02439713, + "epoch": 0.10857933024709913, + "flos": 14027750133120.0, + "grad_norm": 2.0829838065274995, + "language_loss": 0.83508182, + "learning_rate": 3.935662030823279e-06, + "loss": 0.85756397, + "num_input_tokens_seen": 19186305, + "step": 903, + "time_per_iteration": 2.5632071495056152 + }, + { + "auxiliary_loss_clip": 0.01232649, + "auxiliary_loss_mlp": 0.01047161, + "balance_loss_clip": 1.06551313, + "balance_loss_mlp": 1.035604, + "epoch": 0.10869957313773823, + "flos": 13369004657280.0, + "grad_norm": 2.153745957717676, + "language_loss": 0.7304405, + "learning_rate": 3.935465893866998e-06, + "loss": 0.75323868, + "num_input_tokens_seen": 19204530, + "step": 904, + "time_per_iteration": 2.532444953918457 + }, + { + "auxiliary_loss_clip": 0.01221785, + "auxiliary_loss_mlp": 0.01041092, + "balance_loss_clip": 1.06830382, + "balance_loss_mlp": 1.02881348, + "epoch": 0.10881981602837733, + "flos": 25807095509760.0, + "grad_norm": 2.131759386209718, + "language_loss": 0.80578297, + "learning_rate": 3.935269463302079e-06, + "loss": 0.8284117, + "num_input_tokens_seen": 19222735, + "step": 905, + "time_per_iteration": 3.355724334716797 + }, + { + "auxiliary_loss_clip": 0.0123828, + "auxiliary_loss_mlp": 0.01044739, + "balance_loss_clip": 1.06912005, + "balance_loss_mlp": 1.03218603, + "epoch": 0.10894005891901641, + "flos": 20777555387520.0, + "grad_norm": 1.9055713645301335, + "language_loss": 0.76770175, + "learning_rate": 3.935072739158322e-06, + "loss": 0.79053187, + "num_input_tokens_seen": 19242445, + "step": 906, + "time_per_iteration": 2.534024477005005 + }, + { + "auxiliary_loss_clip": 0.01221895, + "auxiliary_loss_mlp": 0.01046419, + "balance_loss_clip": 1.0669359, + "balance_loss_mlp": 1.03391957, + "epoch": 0.10906030180965551, + "flos": 26649883296000.0, + "grad_norm": 1.666361891444486, + "language_loss": 0.7970683, + "learning_rate": 3.934875721465569e-06, + "loss": 0.81975144, + "num_input_tokens_seen": 19262865, + "step": 907, + "time_per_iteration": 3.5007591247558594 + }, + { + "auxiliary_loss_clip": 0.0121226, + "auxiliary_loss_mlp": 0.01039761, + "balance_loss_clip": 1.06006896, + "balance_loss_mlp": 1.0268625, + "epoch": 0.10918054470029459, + "flos": 36534402420480.0, + "grad_norm": 2.2352780812368684, + "language_loss": 0.72035009, + "learning_rate": 3.9346784102537076e-06, + "loss": 0.74287027, + "num_input_tokens_seen": 19285000, + "step": 908, + "time_per_iteration": 3.4692440032958984 + }, + { + "auxiliary_loss_clip": 0.01247544, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.06801498, + "balance_loss_mlp": 1.02451789, + "epoch": 0.10930078759093369, + "flos": 21762549118080.0, + "grad_norm": 2.152585757675597, + "language_loss": 0.78707099, + "learning_rate": 3.934480805552669e-06, + "loss": 0.80990702, + "num_input_tokens_seen": 19306010, + "step": 909, + "time_per_iteration": 2.4838972091674805 + }, + { + "auxiliary_loss_clip": 0.01247914, + "auxiliary_loss_mlp": 0.00766377, + "balance_loss_clip": 1.06895864, + "balance_loss_mlp": 1.00022078, + "epoch": 0.10942103048157277, + "flos": 22601781457920.0, + "grad_norm": 2.239238319449174, + "language_loss": 0.87944019, + "learning_rate": 3.93428290739243e-06, + "loss": 0.8995831, + "num_input_tokens_seen": 19325380, + "step": 910, + "time_per_iteration": 2.55458927154541 + }, + { + "auxiliary_loss_clip": 0.0121816, + "auxiliary_loss_mlp": 0.01043011, + "balance_loss_clip": 1.06523955, + "balance_loss_mlp": 1.03058898, + "epoch": 0.10954127337221187, + "flos": 15045781397760.0, + "grad_norm": 2.6231745476004176, + "language_loss": 0.79772121, + "learning_rate": 3.9340847158030125e-06, + "loss": 0.82033288, + "num_input_tokens_seen": 19338960, + "step": 911, + "time_per_iteration": 2.498889684677124 + }, + { + "auxiliary_loss_clip": 0.01232069, + "auxiliary_loss_mlp": 0.01049271, + "balance_loss_clip": 1.06386662, + "balance_loss_mlp": 1.03742158, + "epoch": 0.10966151626285096, + "flos": 21650974496640.0, + "grad_norm": 1.792408265944816, + "language_loss": 0.75346315, + "learning_rate": 3.9338862308144814e-06, + "loss": 0.77627653, + "num_input_tokens_seen": 19357780, + "step": 912, + "time_per_iteration": 2.5177719593048096 + }, + { + "auxiliary_loss_clip": 0.01246908, + "auxiliary_loss_mlp": 0.01044201, + "balance_loss_clip": 1.06724405, + "balance_loss_mlp": 1.03180289, + "epoch": 0.10978175915349005, + "flos": 20121359777280.0, + "grad_norm": 2.077854498788477, + "language_loss": 0.84350652, + "learning_rate": 3.933687452456946e-06, + "loss": 0.86641765, + "num_input_tokens_seen": 19377680, + "step": 913, + "time_per_iteration": 2.493380069732666 + }, + { + "auxiliary_loss_clip": 0.01198801, + "auxiliary_loss_mlp": 0.01042249, + "balance_loss_clip": 1.05911553, + "balance_loss_mlp": 1.02928543, + "epoch": 0.10990200204412914, + "flos": 20412667077120.0, + "grad_norm": 5.516580276220256, + "language_loss": 0.86608702, + "learning_rate": 3.933488380760562e-06, + "loss": 0.88849753, + "num_input_tokens_seen": 19397040, + "step": 914, + "time_per_iteration": 2.588554859161377 + }, + { + "auxiliary_loss_clip": 0.01246548, + "auxiliary_loss_mlp": 0.00767264, + "balance_loss_clip": 1.06695592, + "balance_loss_mlp": 1.00021315, + "epoch": 0.11002224493476823, + "flos": 17530117660800.0, + "grad_norm": 1.925260600445307, + "language_loss": 0.87039983, + "learning_rate": 3.9332890157555286e-06, + "loss": 0.89053786, + "num_input_tokens_seen": 19413975, + "step": 915, + "time_per_iteration": 2.531214952468872 + }, + { + "auxiliary_loss_clip": 0.01222129, + "auxiliary_loss_mlp": 0.0104416, + "balance_loss_clip": 1.06563783, + "balance_loss_mlp": 1.03213167, + "epoch": 0.11014248782540732, + "flos": 12203093099520.0, + "grad_norm": 1.7937093139015061, + "language_loss": 0.76440042, + "learning_rate": 3.933089357472088e-06, + "loss": 0.78706336, + "num_input_tokens_seen": 19432005, + "step": 916, + "time_per_iteration": 2.5258312225341797 + }, + { + "auxiliary_loss_clip": 0.01244837, + "auxiliary_loss_mlp": 0.01040597, + "balance_loss_clip": 1.06782269, + "balance_loss_mlp": 1.02932549, + "epoch": 0.11026273071604642, + "flos": 22382977760640.0, + "grad_norm": 2.740617401560644, + "language_loss": 0.85856855, + "learning_rate": 3.932889405940529e-06, + "loss": 0.88142288, + "num_input_tokens_seen": 19450100, + "step": 917, + "time_per_iteration": 2.4750916957855225 + }, + { + "auxiliary_loss_clip": 0.01219534, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_clip": 1.0698576, + "balance_loss_mlp": 1.03369641, + "epoch": 0.1103829736066855, + "flos": 19829046896640.0, + "grad_norm": 2.430533758463773, + "language_loss": 0.79831803, + "learning_rate": 3.932689161191184e-06, + "loss": 0.82096469, + "num_input_tokens_seen": 19467805, + "step": 918, + "time_per_iteration": 2.524777889251709 + }, + { + "auxiliary_loss_clip": 0.01229415, + "auxiliary_loss_mlp": 0.01046226, + "balance_loss_clip": 1.06445098, + "balance_loss_mlp": 1.03371453, + "epoch": 0.1105032164973246, + "flos": 22669616292480.0, + "grad_norm": 2.0472912761321327, + "language_loss": 0.88172996, + "learning_rate": 3.93248862325443e-06, + "loss": 0.90448642, + "num_input_tokens_seen": 19486710, + "step": 919, + "time_per_iteration": 2.501171112060547 + }, + { + "auxiliary_loss_clip": 0.01133369, + "auxiliary_loss_mlp": 0.01026136, + "balance_loss_clip": 1.03769565, + "balance_loss_mlp": 1.02213085, + "epoch": 0.11062345938796368, + "flos": 66483507876480.0, + "grad_norm": 0.9466168740475717, + "language_loss": 0.64495075, + "learning_rate": 3.932287792160688e-06, + "loss": 0.66654587, + "num_input_tokens_seen": 19545170, + "step": 920, + "time_per_iteration": 2.9975528717041016 + }, + { + "auxiliary_loss_clip": 0.01234142, + "auxiliary_loss_mlp": 0.01043238, + "balance_loss_clip": 1.06399441, + "balance_loss_mlp": 1.02985084, + "epoch": 0.11074370227860278, + "flos": 21907771804800.0, + "grad_norm": 2.3951888163336976, + "language_loss": 0.80806577, + "learning_rate": 3.932086667940424e-06, + "loss": 0.83083957, + "num_input_tokens_seen": 19561875, + "step": 921, + "time_per_iteration": 2.5301268100738525 + }, + { + "auxiliary_loss_clip": 0.01230691, + "auxiliary_loss_mlp": 0.00766549, + "balance_loss_clip": 1.06736255, + "balance_loss_mlp": 1.0002768, + "epoch": 0.11086394516924186, + "flos": 28658115763200.0, + "grad_norm": 1.9402681870680323, + "language_loss": 0.81970757, + "learning_rate": 3.93188525062415e-06, + "loss": 0.83967996, + "num_input_tokens_seen": 19582340, + "step": 922, + "time_per_iteration": 2.5770580768585205 + }, + { + "auxiliary_loss_clip": 0.01231217, + "auxiliary_loss_mlp": 0.01052854, + "balance_loss_clip": 1.06536674, + "balance_loss_mlp": 1.04053402, + "epoch": 0.11098418805988096, + "flos": 24535247765760.0, + "grad_norm": 1.997487286234216, + "language_loss": 0.86130506, + "learning_rate": 3.931683540242418e-06, + "loss": 0.8841458, + "num_input_tokens_seen": 19603405, + "step": 923, + "time_per_iteration": 2.62070369720459 + }, + { + "auxiliary_loss_clip": 0.01225107, + "auxiliary_loss_mlp": 0.01042776, + "balance_loss_clip": 1.06392956, + "balance_loss_mlp": 1.03000283, + "epoch": 0.11110443095052006, + "flos": 22960384888320.0, + "grad_norm": 2.956341397776146, + "language_loss": 0.9099561, + "learning_rate": 3.9314815368258295e-06, + "loss": 0.93263489, + "num_input_tokens_seen": 19619885, + "step": 924, + "time_per_iteration": 2.5184521675109863 + }, + { + "auxiliary_loss_clip": 0.01235169, + "auxiliary_loss_mlp": 0.01036184, + "balance_loss_clip": 1.07062948, + "balance_loss_mlp": 1.02430487, + "epoch": 0.11122467384115914, + "flos": 18950025265920.0, + "grad_norm": 1.7315162262529846, + "language_loss": 0.78997731, + "learning_rate": 3.9312792404050275e-06, + "loss": 0.81269085, + "num_input_tokens_seen": 19637940, + "step": 925, + "time_per_iteration": 2.5577445030212402 + }, + { + "auxiliary_loss_clip": 0.01245269, + "auxiliary_loss_mlp": 0.01042558, + "balance_loss_clip": 1.0685674, + "balance_loss_mlp": 1.03186452, + "epoch": 0.11134491673179824, + "flos": 25082957324160.0, + "grad_norm": 2.9284512006801258, + "language_loss": 0.77137518, + "learning_rate": 3.9310766510107e-06, + "loss": 0.79425347, + "num_input_tokens_seen": 19657115, + "step": 926, + "time_per_iteration": 2.555898427963257 + }, + { + "auxiliary_loss_clip": 0.01199394, + "auxiliary_loss_mlp": 0.01046524, + "balance_loss_clip": 1.05817652, + "balance_loss_mlp": 1.03373289, + "epoch": 0.11146515962243732, + "flos": 24499121662080.0, + "grad_norm": 1.7924464628446843, + "language_loss": 0.92370033, + "learning_rate": 3.9308737686735806e-06, + "loss": 0.94615948, + "num_input_tokens_seen": 19677075, + "step": 927, + "time_per_iteration": 2.6491332054138184 + }, + { + "auxiliary_loss_clip": 0.0124858, + "auxiliary_loss_mlp": 0.01048801, + "balance_loss_clip": 1.0684545, + "balance_loss_mlp": 1.03753579, + "epoch": 0.11158540251307641, + "flos": 22343763087360.0, + "grad_norm": 2.0353876311859302, + "language_loss": 0.82979387, + "learning_rate": 3.9306705934244455e-06, + "loss": 0.85276765, + "num_input_tokens_seen": 19697155, + "step": 928, + "time_per_iteration": 3.3209948539733887 + }, + { + "auxiliary_loss_clip": 0.01205064, + "auxiliary_loss_mlp": 0.01036856, + "balance_loss_clip": 1.06205869, + "balance_loss_mlp": 1.02527523, + "epoch": 0.11170564540371551, + "flos": 19902304684800.0, + "grad_norm": 1.7112590411728725, + "language_loss": 0.88181931, + "learning_rate": 3.930467125294116e-06, + "loss": 0.90423858, + "num_input_tokens_seen": 19716705, + "step": 929, + "time_per_iteration": 2.533008337020874 + }, + { + "auxiliary_loss_clip": 0.01068866, + "auxiliary_loss_mlp": 0.01006282, + "balance_loss_clip": 1.0246582, + "balance_loss_mlp": 1.00311112, + "epoch": 0.1118258882943546, + "flos": 64586239499520.0, + "grad_norm": 0.9239433802578965, + "language_loss": 0.6047616, + "learning_rate": 3.930263364313458e-06, + "loss": 0.62551308, + "num_input_tokens_seen": 19767275, + "step": 930, + "time_per_iteration": 3.112867832183838 + }, + { + "auxiliary_loss_clip": 0.0119898, + "auxiliary_loss_mlp": 0.01052218, + "balance_loss_clip": 1.06153238, + "balance_loss_mlp": 1.03958821, + "epoch": 0.11194613118499369, + "flos": 17201965985280.0, + "grad_norm": 2.0271495757930964, + "language_loss": 0.82767642, + "learning_rate": 3.930059310513384e-06, + "loss": 0.85018837, + "num_input_tokens_seen": 19786315, + "step": 931, + "time_per_iteration": 3.5619053840637207 + }, + { + "auxiliary_loss_clip": 0.01183773, + "auxiliary_loss_mlp": 0.00766056, + "balance_loss_clip": 1.05905557, + "balance_loss_mlp": 1.0001719, + "epoch": 0.11206637407563277, + "flos": 31863465728640.0, + "grad_norm": 1.8536881824331894, + "language_loss": 0.839719, + "learning_rate": 3.929854963924846e-06, + "loss": 0.85921729, + "num_input_tokens_seen": 19806580, + "step": 932, + "time_per_iteration": 2.6600515842437744 + }, + { + "auxiliary_loss_clip": 0.01201518, + "auxiliary_loss_mlp": 0.01037229, + "balance_loss_clip": 1.06075644, + "balance_loss_mlp": 1.02571964, + "epoch": 0.11218661696627187, + "flos": 21945621761280.0, + "grad_norm": 1.9242009350584048, + "language_loss": 0.77293712, + "learning_rate": 3.929650324578845e-06, + "loss": 0.79532456, + "num_input_tokens_seen": 19826045, + "step": 933, + "time_per_iteration": 2.5921401977539062 + }, + { + "auxiliary_loss_clip": 0.01220124, + "auxiliary_loss_mlp": 0.01043782, + "balance_loss_clip": 1.06360102, + "balance_loss_mlp": 1.03066325, + "epoch": 0.11230685985691095, + "flos": 25878198481920.0, + "grad_norm": 2.682825909780638, + "language_loss": 0.81794131, + "learning_rate": 3.929445392506423e-06, + "loss": 0.84058028, + "num_input_tokens_seen": 19843985, + "step": 934, + "time_per_iteration": 3.467923641204834 + }, + { + "auxiliary_loss_clip": 0.01231093, + "auxiliary_loss_mlp": 0.01046508, + "balance_loss_clip": 1.06905484, + "balance_loss_mlp": 1.03536761, + "epoch": 0.11242710274755005, + "flos": 22231506107520.0, + "grad_norm": 1.9855036795577263, + "language_loss": 0.7621538, + "learning_rate": 3.92924016773867e-06, + "loss": 0.78492975, + "num_input_tokens_seen": 19860480, + "step": 935, + "time_per_iteration": 3.287496328353882 + }, + { + "auxiliary_loss_clip": 0.01216609, + "auxiliary_loss_mlp": 0.00765709, + "balance_loss_clip": 1.06162333, + "balance_loss_mlp": 1.00018072, + "epoch": 0.11254734563818915, + "flos": 17712184723200.0, + "grad_norm": 2.498503811494862, + "language_loss": 0.73961407, + "learning_rate": 3.9290346503067175e-06, + "loss": 0.75943726, + "num_input_tokens_seen": 19877145, + "step": 936, + "time_per_iteration": 2.5347349643707275 + }, + { + "auxiliary_loss_clip": 0.0123198, + "auxiliary_loss_mlp": 0.0104414, + "balance_loss_clip": 1.0633018, + "balance_loss_mlp": 1.03251135, + "epoch": 0.11266758852882823, + "flos": 54930397334400.0, + "grad_norm": 1.879736759513515, + "language_loss": 0.78577709, + "learning_rate": 3.9288288402417415e-06, + "loss": 0.80853832, + "num_input_tokens_seen": 19903405, + "step": 937, + "time_per_iteration": 2.8518640995025635 + }, + { + "auxiliary_loss_clip": 0.01235733, + "auxiliary_loss_mlp": 0.01041264, + "balance_loss_clip": 1.06848824, + "balance_loss_mlp": 1.02865195, + "epoch": 0.11278783141946733, + "flos": 18878132194560.0, + "grad_norm": 2.4776593630824952, + "language_loss": 0.7051841, + "learning_rate": 3.928622737574964e-06, + "loss": 0.72795415, + "num_input_tokens_seen": 19918740, + "step": 938, + "time_per_iteration": 2.540126085281372 + }, + { + "auxiliary_loss_clip": 0.01213743, + "auxiliary_loss_mlp": 0.01045296, + "balance_loss_clip": 1.06124425, + "balance_loss_mlp": 1.03332162, + "epoch": 0.11290807431010641, + "flos": 26469252777600.0, + "grad_norm": 2.185617239783044, + "language_loss": 0.90913594, + "learning_rate": 3.928416342337652e-06, + "loss": 0.93172634, + "num_input_tokens_seen": 19938475, + "step": 939, + "time_per_iteration": 2.5731821060180664 + }, + { + "auxiliary_loss_clip": 0.01217515, + "auxiliary_loss_mlp": 0.01042492, + "balance_loss_clip": 1.06504345, + "balance_loss_mlp": 1.03062451, + "epoch": 0.1130283172007455, + "flos": 22710590732160.0, + "grad_norm": 1.709423476168821, + "language_loss": 0.82656705, + "learning_rate": 3.928209654561113e-06, + "loss": 0.84916717, + "num_input_tokens_seen": 19959310, + "step": 940, + "time_per_iteration": 2.55037522315979 + }, + { + "auxiliary_loss_clip": 0.01208729, + "auxiliary_loss_mlp": 0.01042085, + "balance_loss_clip": 1.06423962, + "balance_loss_mlp": 1.0308553, + "epoch": 0.1131485600913846, + "flos": 23219911630080.0, + "grad_norm": 2.2333480538657104, + "language_loss": 0.81722045, + "learning_rate": 3.928002674276703e-06, + "loss": 0.83972859, + "num_input_tokens_seen": 19978700, + "step": 941, + "time_per_iteration": 2.5602898597717285 + }, + { + "auxiliary_loss_clip": 0.01165758, + "auxiliary_loss_mlp": 0.01038979, + "balance_loss_clip": 1.0541929, + "balance_loss_mlp": 1.0266645, + "epoch": 0.11326880298202369, + "flos": 14064271286400.0, + "grad_norm": 2.308935222181706, + "language_loss": 0.75160742, + "learning_rate": 3.92779540151582e-06, + "loss": 0.77365482, + "num_input_tokens_seen": 19995785, + "step": 942, + "time_per_iteration": 2.5651979446411133 + }, + { + "auxiliary_loss_clip": 0.01213113, + "auxiliary_loss_mlp": 0.01033353, + "balance_loss_clip": 1.06258774, + "balance_loss_mlp": 1.02242732, + "epoch": 0.11338904587266278, + "flos": 16325386479360.0, + "grad_norm": 3.5205517173683547, + "language_loss": 0.8551293, + "learning_rate": 3.927587836309907e-06, + "loss": 0.87759399, + "num_input_tokens_seen": 20013615, + "step": 943, + "time_per_iteration": 2.5361077785491943 + }, + { + "auxiliary_loss_clip": 0.01210224, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.06141591, + "balance_loss_mlp": 1.03237987, + "epoch": 0.11350928876330187, + "flos": 24426258923520.0, + "grad_norm": 2.2304199453775584, + "language_loss": 0.78512174, + "learning_rate": 3.927379978690452e-06, + "loss": 0.80766165, + "num_input_tokens_seen": 20032880, + "step": 944, + "time_per_iteration": 2.5932183265686035 + }, + { + "auxiliary_loss_clip": 0.01185526, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_clip": 1.05306458, + "balance_loss_mlp": 1.03479791, + "epoch": 0.11362953165394096, + "flos": 24497074586880.0, + "grad_norm": 4.6681776614956965, + "language_loss": 0.87398177, + "learning_rate": 3.927171828688987e-06, + "loss": 0.89630222, + "num_input_tokens_seen": 20052405, + "step": 945, + "time_per_iteration": 2.6163206100463867 + }, + { + "auxiliary_loss_clip": 0.01249499, + "auxiliary_loss_mlp": 0.01037155, + "balance_loss_clip": 1.07157063, + "balance_loss_mlp": 1.02580023, + "epoch": 0.11374977454458005, + "flos": 24060831909120.0, + "grad_norm": 4.964377312919228, + "language_loss": 0.82049829, + "learning_rate": 3.926963386337088e-06, + "loss": 0.84336483, + "num_input_tokens_seen": 20070635, + "step": 946, + "time_per_iteration": 2.511021614074707 + }, + { + "auxiliary_loss_clip": 0.01251038, + "auxiliary_loss_mlp": 0.01040158, + "balance_loss_clip": 1.06869376, + "balance_loss_mlp": 1.026896, + "epoch": 0.11387001743521914, + "flos": 39457638967680.0, + "grad_norm": 2.477697303273926, + "language_loss": 0.70211494, + "learning_rate": 3.926754651666375e-06, + "loss": 0.72502685, + "num_input_tokens_seen": 20091195, + "step": 947, + "time_per_iteration": 2.6101086139678955 + }, + { + "auxiliary_loss_clip": 0.01201567, + "auxiliary_loss_mlp": 0.01045759, + "balance_loss_clip": 1.06499004, + "balance_loss_mlp": 1.03382611, + "epoch": 0.11399026032585824, + "flos": 25082454533760.0, + "grad_norm": 2.5759321953506773, + "language_loss": 0.78511375, + "learning_rate": 3.926545624708513e-06, + "loss": 0.80758703, + "num_input_tokens_seen": 20110435, + "step": 948, + "time_per_iteration": 2.662794351577759 + }, + { + "auxiliary_loss_clip": 0.01196644, + "auxiliary_loss_mlp": 0.01048158, + "balance_loss_clip": 1.06100798, + "balance_loss_mlp": 1.0364933, + "epoch": 0.11411050321649732, + "flos": 17961835224960.0, + "grad_norm": 2.2317643758693615, + "language_loss": 0.8537426, + "learning_rate": 3.926336305495213e-06, + "loss": 0.87619054, + "num_input_tokens_seen": 20128995, + "step": 949, + "time_per_iteration": 2.5853984355926514 + }, + { + "auxiliary_loss_clip": 0.01187161, + "auxiliary_loss_mlp": 0.01044806, + "balance_loss_clip": 1.06098485, + "balance_loss_mlp": 1.03126359, + "epoch": 0.11423074610713642, + "flos": 22455409536000.0, + "grad_norm": 2.057630330366888, + "language_loss": 0.88978767, + "learning_rate": 3.926126694058226e-06, + "loss": 0.91210735, + "num_input_tokens_seen": 20148145, + "step": 950, + "time_per_iteration": 2.6092495918273926 + }, + { + "auxiliary_loss_clip": 0.01183678, + "auxiliary_loss_mlp": 0.01044353, + "balance_loss_clip": 1.06493485, + "balance_loss_mlp": 1.03386319, + "epoch": 0.1143509889977755, + "flos": 19717687756800.0, + "grad_norm": 1.4908712306820662, + "language_loss": 0.82143223, + "learning_rate": 3.92591679042935e-06, + "loss": 0.84371257, + "num_input_tokens_seen": 20168035, + "step": 951, + "time_per_iteration": 2.6263771057128906 + }, + { + "auxiliary_loss_clip": 0.012316, + "auxiliary_loss_mlp": 0.01043318, + "balance_loss_clip": 1.06818664, + "balance_loss_mlp": 1.03068161, + "epoch": 0.1144712318884146, + "flos": 19822869757440.0, + "grad_norm": 1.9576754472941424, + "language_loss": 0.82164466, + "learning_rate": 3.92570659464043e-06, + "loss": 0.84439385, + "num_input_tokens_seen": 20186095, + "step": 952, + "time_per_iteration": 2.5014209747314453 + }, + { + "auxiliary_loss_clip": 0.01228435, + "auxiliary_loss_mlp": 0.00766193, + "balance_loss_clip": 1.06834412, + "balance_loss_mlp": 1.00021887, + "epoch": 0.1145914747790537, + "flos": 14939198766720.0, + "grad_norm": 2.068305216321503, + "language_loss": 0.79911947, + "learning_rate": 3.925496106723349e-06, + "loss": 0.81906575, + "num_input_tokens_seen": 20203535, + "step": 953, + "time_per_iteration": 2.4958319664001465 + }, + { + "auxiliary_loss_clip": 0.01234318, + "auxiliary_loss_mlp": 0.01040557, + "balance_loss_clip": 1.06794047, + "balance_loss_mlp": 1.02936339, + "epoch": 0.11471171766969278, + "flos": 19865029345920.0, + "grad_norm": 2.1686564435671607, + "language_loss": 0.83851522, + "learning_rate": 3.9252853267100405e-06, + "loss": 0.86126399, + "num_input_tokens_seen": 20222780, + "step": 954, + "time_per_iteration": 2.5247936248779297 + }, + { + "auxiliary_loss_clip": 0.01189794, + "auxiliary_loss_mlp": 0.01044997, + "balance_loss_clip": 1.06037807, + "balance_loss_mlp": 1.03305256, + "epoch": 0.11483196056033187, + "flos": 22526476594560.0, + "grad_norm": 4.389400191734664, + "language_loss": 0.83695138, + "learning_rate": 3.9250742546324786e-06, + "loss": 0.8592993, + "num_input_tokens_seen": 20243015, + "step": 955, + "time_per_iteration": 3.384770631790161 + }, + { + "auxiliary_loss_clip": 0.0121383, + "auxiliary_loss_mlp": 0.01043415, + "balance_loss_clip": 1.0634445, + "balance_loss_mlp": 1.03280544, + "epoch": 0.11495220345097096, + "flos": 28220292887040.0, + "grad_norm": 5.98804414515056, + "language_loss": 0.87021178, + "learning_rate": 3.924862890522683e-06, + "loss": 0.89278424, + "num_input_tokens_seen": 20263025, + "step": 956, + "time_per_iteration": 2.6026947498321533 + }, + { + "auxiliary_loss_clip": 0.0123029, + "auxiliary_loss_mlp": 0.01041209, + "balance_loss_clip": 1.06451845, + "balance_loss_mlp": 1.02943146, + "epoch": 0.11507244634161005, + "flos": 17492267704320.0, + "grad_norm": 2.104482533949446, + "language_loss": 0.85904467, + "learning_rate": 3.9246512344127174e-06, + "loss": 0.88175964, + "num_input_tokens_seen": 20280685, + "step": 957, + "time_per_iteration": 2.5276901721954346 + }, + { + "auxiliary_loss_clip": 0.01149633, + "auxiliary_loss_mlp": 0.01039141, + "balance_loss_clip": 1.05571187, + "balance_loss_mlp": 1.02764297, + "epoch": 0.11519268923224914, + "flos": 22564937082240.0, + "grad_norm": 3.0797442543689293, + "language_loss": 0.81831956, + "learning_rate": 3.9244392863346895e-06, + "loss": 0.84020722, + "num_input_tokens_seen": 20300090, + "step": 958, + "time_per_iteration": 3.444572925567627 + }, + { + "auxiliary_loss_clip": 0.01220195, + "auxiliary_loss_mlp": 0.01046949, + "balance_loss_clip": 1.06974697, + "balance_loss_mlp": 1.03428292, + "epoch": 0.11531293212288823, + "flos": 16982839065600.0, + "grad_norm": 1.9642476485491807, + "language_loss": 0.92335945, + "learning_rate": 3.9242270463207524e-06, + "loss": 0.94603091, + "num_input_tokens_seen": 20318480, + "step": 959, + "time_per_iteration": 2.530734062194824 + }, + { + "auxiliary_loss_clip": 0.01170837, + "auxiliary_loss_mlp": 0.01041607, + "balance_loss_clip": 1.05797625, + "balance_loss_mlp": 1.0292151, + "epoch": 0.11543317501352733, + "flos": 12422004537600.0, + "grad_norm": 5.7963241261524105, + "language_loss": 0.85763395, + "learning_rate": 3.924014514403102e-06, + "loss": 0.87975836, + "num_input_tokens_seen": 20334635, + "step": 960, + "time_per_iteration": 2.582185745239258 + }, + { + "auxiliary_loss_clip": 0.01173004, + "auxiliary_loss_mlp": 0.01047938, + "balance_loss_clip": 1.05811596, + "balance_loss_mlp": 1.03498578, + "epoch": 0.11555341790416641, + "flos": 19821648695040.0, + "grad_norm": 2.0216681726805295, + "language_loss": 0.91281712, + "learning_rate": 3.92380169061398e-06, + "loss": 0.93502653, + "num_input_tokens_seen": 20352415, + "step": 961, + "time_per_iteration": 3.510355234146118 + }, + { + "auxiliary_loss_clip": 0.01190988, + "auxiliary_loss_mlp": 0.00766267, + "balance_loss_clip": 1.05787051, + "balance_loss_mlp": 1.00016916, + "epoch": 0.11567366079480551, + "flos": 25738865625600.0, + "grad_norm": 4.168283357649946, + "language_loss": 0.83740705, + "learning_rate": 3.9235885749856705e-06, + "loss": 0.85697961, + "num_input_tokens_seen": 20371095, + "step": 962, + "time_per_iteration": 3.366799831390381 + }, + { + "auxiliary_loss_clip": 0.01221592, + "auxiliary_loss_mlp": 0.0104391, + "balance_loss_clip": 1.07287169, + "balance_loss_mlp": 1.03172088, + "epoch": 0.1157939036854446, + "flos": 18223301301120.0, + "grad_norm": 2.3843259083233894, + "language_loss": 0.82769305, + "learning_rate": 3.9233751675505035e-06, + "loss": 0.85034811, + "num_input_tokens_seen": 20389805, + "step": 963, + "time_per_iteration": 2.538644790649414 + }, + { + "auxiliary_loss_clip": 0.01211749, + "auxiliary_loss_mlp": 0.01041751, + "balance_loss_clip": 1.0667789, + "balance_loss_mlp": 1.02849495, + "epoch": 0.11591414657608369, + "flos": 23073755189760.0, + "grad_norm": 1.8655173095163404, + "language_loss": 0.84628808, + "learning_rate": 3.923161468340853e-06, + "loss": 0.86882311, + "num_input_tokens_seen": 20409640, + "step": 964, + "time_per_iteration": 2.5701324939727783 + }, + { + "auxiliary_loss_clip": 0.01170812, + "auxiliary_loss_mlp": 0.01041327, + "balance_loss_clip": 1.05627096, + "balance_loss_mlp": 1.02952552, + "epoch": 0.11603438946672277, + "flos": 19461716461440.0, + "grad_norm": 1.6981682620451493, + "language_loss": 0.81574696, + "learning_rate": 3.9229474773891374e-06, + "loss": 0.83786833, + "num_input_tokens_seen": 20428180, + "step": 965, + "time_per_iteration": 2.60032320022583 + }, + { + "auxiliary_loss_clip": 0.01205725, + "auxiliary_loss_mlp": 0.01050564, + "balance_loss_clip": 1.05884755, + "balance_loss_mlp": 1.03767192, + "epoch": 0.11615463235736187, + "flos": 26831986272000.0, + "grad_norm": 3.0569375799848006, + "language_loss": 0.83467007, + "learning_rate": 3.922733194727818e-06, + "loss": 0.85723293, + "num_input_tokens_seen": 20447975, + "step": 966, + "time_per_iteration": 2.639197826385498 + }, + { + "auxiliary_loss_clip": 0.01236906, + "auxiliary_loss_mlp": 0.01042939, + "balance_loss_clip": 1.06900549, + "balance_loss_mlp": 1.03072011, + "epoch": 0.11627487524800097, + "flos": 18580324533120.0, + "grad_norm": 2.3384779580136685, + "language_loss": 0.8749249, + "learning_rate": 3.922518620389402e-06, + "loss": 0.89772332, + "num_input_tokens_seen": 20464840, + "step": 967, + "time_per_iteration": 2.4678986072540283 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01039579, + "balance_loss_clip": 1.05272841, + "balance_loss_mlp": 1.02700853, + "epoch": 0.11639511813864005, + "flos": 18150474476160.0, + "grad_norm": 3.5046784383285283, + "language_loss": 0.89498007, + "learning_rate": 3.922303754406439e-06, + "loss": 0.91661251, + "num_input_tokens_seen": 20482680, + "step": 968, + "time_per_iteration": 2.648559331893921 + }, + { + "auxiliary_loss_clip": 0.01181333, + "auxiliary_loss_mlp": 0.01049982, + "balance_loss_clip": 1.05794382, + "balance_loss_mlp": 1.03638005, + "epoch": 0.11651536102927915, + "flos": 20922023888640.0, + "grad_norm": 2.4029588315319392, + "language_loss": 0.79096127, + "learning_rate": 3.922088596811526e-06, + "loss": 0.81327444, + "num_input_tokens_seen": 20501810, + "step": 969, + "time_per_iteration": 2.5997016429901123 + }, + { + "auxiliary_loss_clip": 0.01216686, + "auxiliary_loss_mlp": 0.01041144, + "balance_loss_clip": 1.06253982, + "balance_loss_mlp": 1.02989638, + "epoch": 0.11663560391991823, + "flos": 16508602776960.0, + "grad_norm": 2.2351264901267016, + "language_loss": 0.86859179, + "learning_rate": 3.9218731476373e-06, + "loss": 0.89117014, + "num_input_tokens_seen": 20517995, + "step": 970, + "time_per_iteration": 2.4892120361328125 + }, + { + "auxiliary_loss_clip": 0.01239136, + "auxiliary_loss_mlp": 0.01048607, + "balance_loss_clip": 1.07091939, + "balance_loss_mlp": 1.03542817, + "epoch": 0.11675584681055733, + "flos": 19865029345920.0, + "grad_norm": 2.091045713161438, + "language_loss": 0.84533858, + "learning_rate": 3.9216574069164455e-06, + "loss": 0.86821598, + "num_input_tokens_seen": 20536970, + "step": 971, + "time_per_iteration": 2.4939916133880615 + }, + { + "auxiliary_loss_clip": 0.01242973, + "auxiliary_loss_mlp": 0.01041652, + "balance_loss_clip": 1.06727505, + "balance_loss_mlp": 1.03064299, + "epoch": 0.11687608970119642, + "flos": 21944364785280.0, + "grad_norm": 1.5287384535308486, + "language_loss": 0.79943645, + "learning_rate": 3.921441374681691e-06, + "loss": 0.82228267, + "num_input_tokens_seen": 20557030, + "step": 972, + "time_per_iteration": 2.519932508468628 + }, + { + "auxiliary_loss_clip": 0.0121033, + "auxiliary_loss_mlp": 0.01038171, + "balance_loss_clip": 1.06396246, + "balance_loss_mlp": 1.0262444, + "epoch": 0.1169963325918355, + "flos": 24061155131520.0, + "grad_norm": 2.1838902758807968, + "language_loss": 0.65210819, + "learning_rate": 3.921225050965808e-06, + "loss": 0.67459321, + "num_input_tokens_seen": 20576915, + "step": 973, + "time_per_iteration": 2.5642313957214355 + }, + { + "auxiliary_loss_clip": 0.01196285, + "auxiliary_loss_mlp": 0.01040274, + "balance_loss_clip": 1.06056392, + "balance_loss_mlp": 1.02817452, + "epoch": 0.1171165754824746, + "flos": 23368151059200.0, + "grad_norm": 2.168247555973414, + "language_loss": 0.74746329, + "learning_rate": 3.921008435801612e-06, + "loss": 0.76982886, + "num_input_tokens_seen": 20596000, + "step": 974, + "time_per_iteration": 2.5717360973358154 + }, + { + "auxiliary_loss_clip": 0.01217838, + "auxiliary_loss_mlp": 0.01040911, + "balance_loss_clip": 1.06437182, + "balance_loss_mlp": 1.02840626, + "epoch": 0.11723681837311369, + "flos": 18552243075840.0, + "grad_norm": 3.907495793045152, + "language_loss": 0.75140554, + "learning_rate": 3.920791529221963e-06, + "loss": 0.77399302, + "num_input_tokens_seen": 20614675, + "step": 975, + "time_per_iteration": 2.504629135131836 + }, + { + "auxiliary_loss_clip": 0.01218052, + "auxiliary_loss_mlp": 0.00766203, + "balance_loss_clip": 1.06507802, + "balance_loss_mlp": 1.00019956, + "epoch": 0.11735706126375278, + "flos": 23550541344000.0, + "grad_norm": 2.22358341578932, + "language_loss": 0.7644068, + "learning_rate": 3.920574331259768e-06, + "loss": 0.78424937, + "num_input_tokens_seen": 20635875, + "step": 976, + "time_per_iteration": 2.600533962249756 + }, + { + "auxiliary_loss_clip": 0.01203751, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.0616771, + "balance_loss_mlp": 1.02849603, + "epoch": 0.11747730415439187, + "flos": 22381541216640.0, + "grad_norm": 2.478980233029111, + "language_loss": 0.79192472, + "learning_rate": 3.9203568419479716e-06, + "loss": 0.81435591, + "num_input_tokens_seen": 20656430, + "step": 977, + "time_per_iteration": 2.5490169525146484 + }, + { + "auxiliary_loss_clip": 0.01213339, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.06437027, + "balance_loss_mlp": 1.02024031, + "epoch": 0.11759754704503096, + "flos": 22200731130240.0, + "grad_norm": 1.95727999590609, + "language_loss": 0.75444448, + "learning_rate": 3.92013906131957e-06, + "loss": 0.77689034, + "num_input_tokens_seen": 20675360, + "step": 978, + "time_per_iteration": 2.5403544902801514 + }, + { + "auxiliary_loss_clip": 0.01197648, + "auxiliary_loss_mlp": 0.01051672, + "balance_loss_clip": 1.06457043, + "balance_loss_mlp": 1.04108584, + "epoch": 0.11771778993567006, + "flos": 22309755886080.0, + "grad_norm": 1.6120511554181212, + "language_loss": 0.8221814, + "learning_rate": 3.9199209894076e-06, + "loss": 0.84467459, + "num_input_tokens_seen": 20695675, + "step": 979, + "time_per_iteration": 2.5954396724700928 + }, + { + "auxiliary_loss_clip": 0.01246823, + "auxiliary_loss_mlp": 0.01036814, + "balance_loss_clip": 1.06715775, + "balance_loss_mlp": 1.02373064, + "epoch": 0.11783803282630914, + "flos": 21288169175040.0, + "grad_norm": 1.808185574917874, + "language_loss": 0.89461297, + "learning_rate": 3.919702626245142e-06, + "loss": 0.91744941, + "num_input_tokens_seen": 20715330, + "step": 980, + "time_per_iteration": 2.491637706756592 + }, + { + "auxiliary_loss_clip": 0.01199504, + "auxiliary_loss_mlp": 0.0103896, + "balance_loss_clip": 1.06062829, + "balance_loss_mlp": 1.02705121, + "epoch": 0.11795827571694824, + "flos": 25371535190400.0, + "grad_norm": 2.1781573558250193, + "language_loss": 0.6653322, + "learning_rate": 3.919483971865322e-06, + "loss": 0.68771684, + "num_input_tokens_seen": 20735325, + "step": 981, + "time_per_iteration": 3.409590482711792 + }, + { + "auxiliary_loss_clip": 0.01212901, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.06766129, + "balance_loss_mlp": 1.02573252, + "epoch": 0.11807851860758732, + "flos": 23622218933760.0, + "grad_norm": 2.1999078750208447, + "language_loss": 0.88020611, + "learning_rate": 3.91926502630131e-06, + "loss": 0.9027018, + "num_input_tokens_seen": 20755940, + "step": 982, + "time_per_iteration": 2.5646488666534424 + }, + { + "auxiliary_loss_clip": 0.01234963, + "auxiliary_loss_mlp": 0.01041378, + "balance_loss_clip": 1.07178938, + "balance_loss_mlp": 1.0300653, + "epoch": 0.11819876149822642, + "flos": 24972496024320.0, + "grad_norm": 1.8677443841249768, + "language_loss": 0.72238576, + "learning_rate": 3.91904578958632e-06, + "loss": 0.74514914, + "num_input_tokens_seen": 20775355, + "step": 983, + "time_per_iteration": 2.537376642227173 + }, + { + "auxiliary_loss_clip": 0.01248471, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.06952524, + "balance_loss_mlp": 1.03206599, + "epoch": 0.11831900438886551, + "flos": 23003226835200.0, + "grad_norm": 4.291269702588566, + "language_loss": 0.83957946, + "learning_rate": 3.918826261753608e-06, + "loss": 0.86250186, + "num_input_tokens_seen": 20794935, + "step": 984, + "time_per_iteration": 2.5083096027374268 + }, + { + "auxiliary_loss_clip": 0.01212761, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.06468964, + "balance_loss_mlp": 1.0236696, + "epoch": 0.1184392472795046, + "flos": 27965147604480.0, + "grad_norm": 2.5894371408203622, + "language_loss": 0.70987403, + "learning_rate": 3.918606442836478e-06, + "loss": 0.73233777, + "num_input_tokens_seen": 20817155, + "step": 985, + "time_per_iteration": 3.4155235290527344 + }, + { + "auxiliary_loss_clip": 0.01228825, + "auxiliary_loss_mlp": 0.01037365, + "balance_loss_clip": 1.07058084, + "balance_loss_mlp": 1.02657628, + "epoch": 0.1185594901701437, + "flos": 19898497843200.0, + "grad_norm": 1.841555434564848, + "language_loss": 0.77150846, + "learning_rate": 3.918386332868277e-06, + "loss": 0.79417026, + "num_input_tokens_seen": 20835125, + "step": 986, + "time_per_iteration": 2.5039713382720947 + }, + { + "auxiliary_loss_clip": 0.01218375, + "auxiliary_loss_mlp": 0.01045865, + "balance_loss_clip": 1.06409431, + "balance_loss_mlp": 1.03445721, + "epoch": 0.11867973306078278, + "flos": 18912354877440.0, + "grad_norm": 1.845581159073504, + "language_loss": 0.94383878, + "learning_rate": 3.918165931882394e-06, + "loss": 0.96648121, + "num_input_tokens_seen": 20853525, + "step": 987, + "time_per_iteration": 2.5112380981445312 + }, + { + "auxiliary_loss_clip": 0.0115319, + "auxiliary_loss_mlp": 0.01038497, + "balance_loss_clip": 1.05210078, + "balance_loss_mlp": 1.02598, + "epoch": 0.11879997595142187, + "flos": 16982803152000.0, + "grad_norm": 3.3857860573184153, + "language_loss": 0.75145078, + "learning_rate": 3.917945239912264e-06, + "loss": 0.7733677, + "num_input_tokens_seen": 20871000, + "step": 988, + "time_per_iteration": 4.2755584716796875 + }, + { + "auxiliary_loss_clip": 0.01178952, + "auxiliary_loss_mlp": 0.0104041, + "balance_loss_clip": 1.05881488, + "balance_loss_mlp": 1.03004456, + "epoch": 0.11892021884206096, + "flos": 17530369056000.0, + "grad_norm": 1.9657056685116383, + "language_loss": 0.75387728, + "learning_rate": 3.917724256991367e-06, + "loss": 0.77607089, + "num_input_tokens_seen": 20889745, + "step": 989, + "time_per_iteration": 2.686502456665039 + }, + { + "auxiliary_loss_clip": 0.01203132, + "auxiliary_loss_mlp": 0.010496, + "balance_loss_clip": 1.06265628, + "balance_loss_mlp": 1.03812623, + "epoch": 0.11904046173270005, + "flos": 30955895763840.0, + "grad_norm": 2.1633720294946435, + "language_loss": 0.81432229, + "learning_rate": 3.9175029831532245e-06, + "loss": 0.83684963, + "num_input_tokens_seen": 20909260, + "step": 990, + "time_per_iteration": 2.6568806171417236 + }, + { + "auxiliary_loss_clip": 0.01202964, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.06832671, + "balance_loss_mlp": 1.02449894, + "epoch": 0.11916070462333915, + "flos": 20157234485760.0, + "grad_norm": 2.0790307712942884, + "language_loss": 0.88281453, + "learning_rate": 3.917281418431404e-06, + "loss": 0.90519357, + "num_input_tokens_seen": 20928305, + "step": 991, + "time_per_iteration": 2.6638309955596924 + }, + { + "auxiliary_loss_clip": 0.01212428, + "auxiliary_loss_mlp": 0.01042167, + "balance_loss_clip": 1.06696117, + "balance_loss_mlp": 1.0302701, + "epoch": 0.11928094751397823, + "flos": 23551115961600.0, + "grad_norm": 1.9549558493065298, + "language_loss": 0.76683331, + "learning_rate": 3.917059562859516e-06, + "loss": 0.78937924, + "num_input_tokens_seen": 20947630, + "step": 992, + "time_per_iteration": 2.564865827560425 + }, + { + "auxiliary_loss_clip": 0.01204012, + "auxiliary_loss_mlp": 0.01048542, + "balance_loss_clip": 1.0655551, + "balance_loss_mlp": 1.03557777, + "epoch": 0.11940119040461733, + "flos": 23908426502400.0, + "grad_norm": 2.5006763860702588, + "language_loss": 0.88598096, + "learning_rate": 3.916837416471218e-06, + "loss": 0.90850651, + "num_input_tokens_seen": 20964250, + "step": 993, + "time_per_iteration": 2.5598461627960205 + }, + { + "auxiliary_loss_clip": 0.01221992, + "auxiliary_loss_mlp": 0.0104173, + "balance_loss_clip": 1.06321526, + "balance_loss_mlp": 1.0302918, + "epoch": 0.11952143329525641, + "flos": 13844533835520.0, + "grad_norm": 3.045862317496398, + "language_loss": 0.71790588, + "learning_rate": 3.916614979300207e-06, + "loss": 0.74054313, + "num_input_tokens_seen": 20979095, + "step": 994, + "time_per_iteration": 2.4886584281921387 + }, + { + "auxiliary_loss_clip": 0.01170908, + "auxiliary_loss_mlp": 0.01042568, + "balance_loss_clip": 1.05977333, + "balance_loss_mlp": 1.03173828, + "epoch": 0.11964167618589551, + "flos": 27015525792000.0, + "grad_norm": 1.554600703162546, + "language_loss": 0.78779697, + "learning_rate": 3.9163922513802274e-06, + "loss": 0.80993176, + "num_input_tokens_seen": 21001430, + "step": 995, + "time_per_iteration": 2.7433764934539795 + }, + { + "auxiliary_loss_clip": 0.01247442, + "auxiliary_loss_mlp": 0.01039953, + "balance_loss_clip": 1.06869578, + "balance_loss_mlp": 1.02862787, + "epoch": 0.1197619190765346, + "flos": 12567622273920.0, + "grad_norm": 2.7222687295562857, + "language_loss": 0.82726228, + "learning_rate": 3.916169232745067e-06, + "loss": 0.85013628, + "num_input_tokens_seen": 21019105, + "step": 996, + "time_per_iteration": 2.5554778575897217 + }, + { + "auxiliary_loss_clip": 0.01201251, + "auxiliary_loss_mlp": 0.01045157, + "balance_loss_clip": 1.0626092, + "balance_loss_mlp": 1.033355, + "epoch": 0.11988216196717369, + "flos": 16909437623040.0, + "grad_norm": 2.586800204020113, + "language_loss": 0.91545796, + "learning_rate": 3.915945923428559e-06, + "loss": 0.93792206, + "num_input_tokens_seen": 21035630, + "step": 997, + "time_per_iteration": 2.5254056453704834 + }, + { + "auxiliary_loss_clip": 0.01223692, + "auxiliary_loss_mlp": 0.01038237, + "balance_loss_clip": 1.06402433, + "balance_loss_mlp": 1.02670395, + "epoch": 0.12000240485781279, + "flos": 16216577205120.0, + "grad_norm": 2.3763455338426884, + "language_loss": 0.82817006, + "learning_rate": 3.915722323464577e-06, + "loss": 0.85078937, + "num_input_tokens_seen": 21054235, + "step": 998, + "time_per_iteration": 2.485231876373291 + }, + { + "auxiliary_loss_clip": 0.01229292, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.06680942, + "balance_loss_mlp": 1.03034723, + "epoch": 0.12012264774845187, + "flos": 49344887525760.0, + "grad_norm": 2.3394692009951137, + "language_loss": 0.70101988, + "learning_rate": 3.91549843288704e-06, + "loss": 0.72373348, + "num_input_tokens_seen": 21077915, + "step": 999, + "time_per_iteration": 2.760650396347046 + }, + { + "auxiliary_loss_clip": 0.01193444, + "auxiliary_loss_mlp": 0.00765927, + "balance_loss_clip": 1.05798531, + "balance_loss_mlp": 1.00029325, + "epoch": 0.12024289063909097, + "flos": 26979435601920.0, + "grad_norm": 1.9741332994676182, + "language_loss": 0.79050684, + "learning_rate": 3.915274251729916e-06, + "loss": 0.81010056, + "num_input_tokens_seen": 21099205, + "step": 1000, + "time_per_iteration": 2.697967290878296 + }, + { + "auxiliary_loss_clip": 0.01199432, + "auxiliary_loss_mlp": 0.01035334, + "balance_loss_clip": 1.0646801, + "balance_loss_mlp": 1.02346087, + "epoch": 0.12036313352973005, + "flos": 19537308633600.0, + "grad_norm": 1.9642043382316006, + "language_loss": 0.89977753, + "learning_rate": 3.91504978002721e-06, + "loss": 0.92212522, + "num_input_tokens_seen": 21118260, + "step": 1001, + "time_per_iteration": 2.580869197845459 + }, + { + "auxiliary_loss_clip": 0.01215672, + "auxiliary_loss_mlp": 0.00765759, + "balance_loss_clip": 1.06333685, + "balance_loss_mlp": 1.00017905, + "epoch": 0.12048337642036915, + "flos": 17268256535040.0, + "grad_norm": 1.956488779013999, + "language_loss": 0.76307422, + "learning_rate": 3.914825017812974e-06, + "loss": 0.78288853, + "num_input_tokens_seen": 21134910, + "step": 1002, + "time_per_iteration": 2.6101584434509277 + }, + { + "auxiliary_loss_clip": 0.0121423, + "auxiliary_loss_mlp": 0.01042335, + "balance_loss_clip": 1.06611443, + "balance_loss_mlp": 1.03052139, + "epoch": 0.12060361931100824, + "flos": 22856962654080.0, + "grad_norm": 2.6868168685223743, + "language_loss": 0.72553027, + "learning_rate": 3.9145999651213065e-06, + "loss": 0.74809587, + "num_input_tokens_seen": 21154150, + "step": 1003, + "time_per_iteration": 2.6212010383605957 + }, + { + "auxiliary_loss_clip": 0.01230815, + "auxiliary_loss_mlp": 0.01043561, + "balance_loss_clip": 1.06716847, + "balance_loss_mlp": 1.03109193, + "epoch": 0.12072386220164733, + "flos": 16726795943040.0, + "grad_norm": 3.28910021969422, + "language_loss": 0.88465434, + "learning_rate": 3.9143746219863465e-06, + "loss": 0.9073981, + "num_input_tokens_seen": 21171255, + "step": 1004, + "time_per_iteration": 2.501833915710449 + }, + { + "auxiliary_loss_clip": 0.01119583, + "auxiliary_loss_mlp": 0.01007689, + "balance_loss_clip": 1.03047168, + "balance_loss_mlp": 1.00439918, + "epoch": 0.12084410509228642, + "flos": 55144176105600.0, + "grad_norm": 0.9718233589977261, + "language_loss": 0.64785594, + "learning_rate": 3.914148988442278e-06, + "loss": 0.66912866, + "num_input_tokens_seen": 21227045, + "step": 1005, + "time_per_iteration": 3.104682683944702 + }, + { + "auxiliary_loss_clip": 0.01200345, + "auxiliary_loss_mlp": 0.01036453, + "balance_loss_clip": 1.06267571, + "balance_loss_mlp": 1.02454448, + "epoch": 0.1209643479829255, + "flos": 26760236855040.0, + "grad_norm": 2.7257032149404576, + "language_loss": 0.94802999, + "learning_rate": 3.91392306452333e-06, + "loss": 0.97039801, + "num_input_tokens_seen": 21244120, + "step": 1006, + "time_per_iteration": 2.603006601333618 + }, + { + "auxiliary_loss_clip": 0.0124971, + "auxiliary_loss_mlp": 0.0103622, + "balance_loss_clip": 1.07035995, + "balance_loss_mlp": 1.02459073, + "epoch": 0.1210845908735646, + "flos": 11035026725760.0, + "grad_norm": 4.355028201786306, + "language_loss": 0.66476691, + "learning_rate": 3.913696850263774e-06, + "loss": 0.68762624, + "num_input_tokens_seen": 21258485, + "step": 1007, + "time_per_iteration": 2.5299501419067383 + }, + { + "auxiliary_loss_clip": 0.01228557, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.06662476, + "balance_loss_mlp": 1.02625632, + "epoch": 0.1212048337642037, + "flos": 20484631975680.0, + "grad_norm": 2.345442799468082, + "language_loss": 0.79027092, + "learning_rate": 3.913470345697929e-06, + "loss": 0.81293243, + "num_input_tokens_seen": 21277115, + "step": 1008, + "time_per_iteration": 3.3289883136749268 + }, + { + "auxiliary_loss_clip": 0.01184583, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_clip": 1.06186759, + "balance_loss_mlp": 1.02905202, + "epoch": 0.12132507665484278, + "flos": 22346061557760.0, + "grad_norm": 2.14806818769836, + "language_loss": 0.85448384, + "learning_rate": 3.913243550860153e-06, + "loss": 0.87673414, + "num_input_tokens_seen": 21294880, + "step": 1009, + "time_per_iteration": 2.643724203109741 + }, + { + "auxiliary_loss_clip": 0.01235156, + "auxiliary_loss_mlp": 0.01041644, + "balance_loss_clip": 1.07236242, + "balance_loss_mlp": 1.02952623, + "epoch": 0.12144531954548188, + "flos": 29314957818240.0, + "grad_norm": 4.906364335392105, + "language_loss": 0.76191723, + "learning_rate": 3.913016465784852e-06, + "loss": 0.78468519, + "num_input_tokens_seen": 21315555, + "step": 1010, + "time_per_iteration": 2.5796854496002197 + }, + { + "auxiliary_loss_clip": 0.01181669, + "auxiliary_loss_mlp": 0.01040886, + "balance_loss_clip": 1.05798078, + "balance_loss_mlp": 1.02863169, + "epoch": 0.12156556243612096, + "flos": 20485242506880.0, + "grad_norm": 2.396291420679566, + "language_loss": 0.72145033, + "learning_rate": 3.912789090506474e-06, + "loss": 0.74367583, + "num_input_tokens_seen": 21334815, + "step": 1011, + "time_per_iteration": 2.6079001426696777 + }, + { + "auxiliary_loss_clip": 0.01204323, + "auxiliary_loss_mlp": 0.01046784, + "balance_loss_clip": 1.0612576, + "balance_loss_mlp": 1.03437412, + "epoch": 0.12168580532676006, + "flos": 16472009796480.0, + "grad_norm": 2.8260098849450337, + "language_loss": 0.71941292, + "learning_rate": 3.9125614250595114e-06, + "loss": 0.74192393, + "num_input_tokens_seen": 21351025, + "step": 1012, + "time_per_iteration": 3.292996406555176 + }, + { + "auxiliary_loss_clip": 0.01228391, + "auxiliary_loss_mlp": 0.01038409, + "balance_loss_clip": 1.06464124, + "balance_loss_mlp": 1.0263629, + "epoch": 0.12180604821739914, + "flos": 15341290588800.0, + "grad_norm": 4.1629457312837275, + "language_loss": 0.8891508, + "learning_rate": 3.912333469478502e-06, + "loss": 0.91181874, + "num_input_tokens_seen": 21368990, + "step": 1013, + "time_per_iteration": 2.4810686111450195 + }, + { + "auxiliary_loss_clip": 0.01211681, + "auxiliary_loss_mlp": 0.01036762, + "balance_loss_clip": 1.06220818, + "balance_loss_mlp": 1.02556276, + "epoch": 0.12192629110803824, + "flos": 19318038059520.0, + "grad_norm": 2.192660204255425, + "language_loss": 0.7811994, + "learning_rate": 3.912105223798025e-06, + "loss": 0.80368388, + "num_input_tokens_seen": 21388410, + "step": 1014, + "time_per_iteration": 4.213102102279663 + }, + { + "auxiliary_loss_clip": 0.01104352, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.02628875, + "balance_loss_mlp": 1.00400352, + "epoch": 0.12204653399867733, + "flos": 47725354085760.0, + "grad_norm": 1.004827507167101, + "language_loss": 0.67704082, + "learning_rate": 3.9118766880527065e-06, + "loss": 0.69815409, + "num_input_tokens_seen": 21442845, + "step": 1015, + "time_per_iteration": 3.033341884613037 + }, + { + "auxiliary_loss_clip": 0.01172378, + "auxiliary_loss_mlp": 0.01033919, + "balance_loss_clip": 1.05895948, + "balance_loss_mlp": 1.02293396, + "epoch": 0.12216677688931642, + "flos": 18221936584320.0, + "grad_norm": 1.7625180339866533, + "language_loss": 0.73916751, + "learning_rate": 3.9116478622772145e-06, + "loss": 0.76123047, + "num_input_tokens_seen": 21461420, + "step": 1016, + "time_per_iteration": 2.603590965270996 + }, + { + "auxiliary_loss_clip": 0.01228523, + "auxiliary_loss_mlp": 0.01047251, + "balance_loss_clip": 1.06761563, + "balance_loss_mlp": 1.03543186, + "epoch": 0.12228701977995551, + "flos": 27525636789120.0, + "grad_norm": 1.641851045288504, + "language_loss": 0.87946522, + "learning_rate": 3.911418746506261e-06, + "loss": 0.90222299, + "num_input_tokens_seen": 21481550, + "step": 1017, + "time_per_iteration": 2.5945804119110107 + }, + { + "auxiliary_loss_clip": 0.01236243, + "auxiliary_loss_mlp": 0.0104591, + "balance_loss_clip": 1.07309771, + "balance_loss_mlp": 1.03404868, + "epoch": 0.1224072626705946, + "flos": 21798136517760.0, + "grad_norm": 1.7461045162125741, + "language_loss": 0.78283679, + "learning_rate": 3.911189340774604e-06, + "loss": 0.80565834, + "num_input_tokens_seen": 21501680, + "step": 1018, + "time_per_iteration": 2.583975315093994 + }, + { + "auxiliary_loss_clip": 0.01222008, + "auxiliary_loss_mlp": 0.01040371, + "balance_loss_clip": 1.06582403, + "balance_loss_mlp": 1.0284853, + "epoch": 0.1225275055612337, + "flos": 20703758895360.0, + "grad_norm": 1.8798613184521535, + "language_loss": 0.79470217, + "learning_rate": 3.910959645117043e-06, + "loss": 0.81732595, + "num_input_tokens_seen": 21521015, + "step": 1019, + "time_per_iteration": 2.5504541397094727 + }, + { + "auxiliary_loss_clip": 0.01108783, + "auxiliary_loss_mlp": 0.00755753, + "balance_loss_clip": 1.02608109, + "balance_loss_mlp": 0.99982893, + "epoch": 0.12264774845187278, + "flos": 57745294462080.0, + "grad_norm": 0.8250716008429229, + "language_loss": 0.5673728, + "learning_rate": 3.910729659568423e-06, + "loss": 0.58601809, + "num_input_tokens_seen": 21578200, + "step": 1020, + "time_per_iteration": 3.0903711318969727 + }, + { + "auxiliary_loss_clip": 0.01214141, + "auxiliary_loss_mlp": 0.01040553, + "balance_loss_clip": 1.06681371, + "balance_loss_mlp": 1.02984226, + "epoch": 0.12276799134251187, + "flos": 26396282298240.0, + "grad_norm": 1.8921659514973732, + "language_loss": 0.82302976, + "learning_rate": 3.9104993841636344e-06, + "loss": 0.8455767, + "num_input_tokens_seen": 21598770, + "step": 1021, + "time_per_iteration": 2.5958285331726074 + }, + { + "auxiliary_loss_clip": 0.01213043, + "auxiliary_loss_mlp": 0.00765046, + "balance_loss_clip": 1.06837499, + "balance_loss_mlp": 1.00014257, + "epoch": 0.12288823423315097, + "flos": 21064193919360.0, + "grad_norm": 1.838728974577964, + "language_loss": 0.80708408, + "learning_rate": 3.910268818937608e-06, + "loss": 0.82686502, + "num_input_tokens_seen": 21616925, + "step": 1022, + "time_per_iteration": 2.6083619594573975 + }, + { + "auxiliary_loss_clip": 0.01182559, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.06323099, + "balance_loss_mlp": 1.02887464, + "epoch": 0.12300847712379005, + "flos": 12312441077760.0, + "grad_norm": 2.79147987123607, + "language_loss": 0.87200677, + "learning_rate": 3.9100379639253196e-06, + "loss": 0.89423394, + "num_input_tokens_seen": 21633645, + "step": 1023, + "time_per_iteration": 2.563525676727295 + }, + { + "auxiliary_loss_clip": 0.01209931, + "auxiliary_loss_mlp": 0.01038633, + "balance_loss_clip": 1.05983126, + "balance_loss_mlp": 1.02677727, + "epoch": 0.12312872001442915, + "flos": 16762239688320.0, + "grad_norm": 7.128808907151381, + "language_loss": 0.86316663, + "learning_rate": 3.909806819161791e-06, + "loss": 0.8856523, + "num_input_tokens_seen": 21649120, + "step": 1024, + "time_per_iteration": 2.5015146732330322 + }, + { + "auxiliary_loss_clip": 0.01200876, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_clip": 1.06189966, + "balance_loss_mlp": 1.02354836, + "epoch": 0.12324896290506823, + "flos": 18404937400320.0, + "grad_norm": 3.367361678904254, + "language_loss": 0.85992765, + "learning_rate": 3.909575384682086e-06, + "loss": 0.88228869, + "num_input_tokens_seen": 21668000, + "step": 1025, + "time_per_iteration": 2.5635976791381836 + }, + { + "auxiliary_loss_clip": 0.01230146, + "auxiliary_loss_mlp": 0.01053171, + "balance_loss_clip": 1.06511676, + "balance_loss_mlp": 1.04119062, + "epoch": 0.12336920579570733, + "flos": 18915407533440.0, + "grad_norm": 2.107270800292623, + "language_loss": 0.69035459, + "learning_rate": 3.9093436605213144e-06, + "loss": 0.71318769, + "num_input_tokens_seen": 21688500, + "step": 1026, + "time_per_iteration": 2.5208303928375244 + }, + { + "auxiliary_loss_clip": 0.01213136, + "auxiliary_loss_mlp": 0.01045114, + "balance_loss_clip": 1.06448293, + "balance_loss_mlp": 1.03399217, + "epoch": 0.12348944868634643, + "flos": 23878369797120.0, + "grad_norm": 1.931677531205367, + "language_loss": 0.79118574, + "learning_rate": 3.909111646714627e-06, + "loss": 0.81376833, + "num_input_tokens_seen": 21709345, + "step": 1027, + "time_per_iteration": 2.563126802444458 + }, + { + "auxiliary_loss_clip": 0.01239718, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.06591201, + "balance_loss_mlp": 1.02241564, + "epoch": 0.12360969157698551, + "flos": 19026084314880.0, + "grad_norm": 3.2425410989092485, + "language_loss": 0.72293043, + "learning_rate": 3.9088793432972206e-06, + "loss": 0.74565667, + "num_input_tokens_seen": 21728165, + "step": 1028, + "time_per_iteration": 2.495054006576538 + }, + { + "auxiliary_loss_clip": 0.0118091, + "auxiliary_loss_mlp": 0.01041615, + "balance_loss_clip": 1.06134009, + "balance_loss_mlp": 1.03015304, + "epoch": 0.1237299344676246, + "flos": 13224607983360.0, + "grad_norm": 2.1911629470058616, + "language_loss": 0.81957233, + "learning_rate": 3.908646750304336e-06, + "loss": 0.84179765, + "num_input_tokens_seen": 21745850, + "step": 1029, + "time_per_iteration": 2.695385217666626 + }, + { + "auxiliary_loss_clip": 0.01215696, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.06603193, + "balance_loss_mlp": 1.02723098, + "epoch": 0.12385017735826369, + "flos": 20485673470080.0, + "grad_norm": 1.736101877835852, + "language_loss": 0.87471676, + "learning_rate": 3.908413867771257e-06, + "loss": 0.89725852, + "num_input_tokens_seen": 21764760, + "step": 1030, + "time_per_iteration": 2.531994342803955 + }, + { + "auxiliary_loss_clip": 0.01227987, + "auxiliary_loss_mlp": 0.01044605, + "balance_loss_clip": 1.06733656, + "balance_loss_mlp": 1.03248167, + "epoch": 0.12397042024890279, + "flos": 17347835116800.0, + "grad_norm": 1.783422361228728, + "language_loss": 0.80791593, + "learning_rate": 3.908180695733311e-06, + "loss": 0.83064187, + "num_input_tokens_seen": 21784250, + "step": 1031, + "time_per_iteration": 2.479642152786255 + }, + { + "auxiliary_loss_clip": 0.01159212, + "auxiliary_loss_mlp": 0.01046317, + "balance_loss_clip": 1.05432057, + "balance_loss_mlp": 1.03497982, + "epoch": 0.12409066313954187, + "flos": 20412343854720.0, + "grad_norm": 1.8873261131681616, + "language_loss": 0.82790679, + "learning_rate": 3.907947234225871e-06, + "loss": 0.84996206, + "num_input_tokens_seen": 21803260, + "step": 1032, + "time_per_iteration": 2.625009775161743 + }, + { + "auxiliary_loss_clip": 0.01160627, + "auxiliary_loss_mlp": 0.01035329, + "balance_loss_clip": 1.0585804, + "balance_loss_mlp": 1.02444494, + "epoch": 0.12421090603018096, + "flos": 20736688688640.0, + "grad_norm": 2.190576644275307, + "language_loss": 0.87038624, + "learning_rate": 3.907713483284352e-06, + "loss": 0.89234579, + "num_input_tokens_seen": 21822735, + "step": 1033, + "time_per_iteration": 2.6443607807159424 + }, + { + "auxiliary_loss_clip": 0.01140039, + "auxiliary_loss_mlp": 0.01044361, + "balance_loss_clip": 1.05256724, + "balance_loss_mlp": 1.03128386, + "epoch": 0.12433114892082006, + "flos": 24498834353280.0, + "grad_norm": 2.2901131175290224, + "language_loss": 0.97754049, + "learning_rate": 3.907479442944216e-06, + "loss": 0.99938446, + "num_input_tokens_seen": 21841140, + "step": 1034, + "time_per_iteration": 3.532508134841919 + }, + { + "auxiliary_loss_clip": 0.01225909, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.0662756, + "balance_loss_mlp": 1.02637112, + "epoch": 0.12445139181145914, + "flos": 19682315838720.0, + "grad_norm": 2.198523362472246, + "language_loss": 0.9256736, + "learning_rate": 3.907245113240963e-06, + "loss": 0.94829845, + "num_input_tokens_seen": 21859260, + "step": 1035, + "time_per_iteration": 2.4889605045318604 + }, + { + "auxiliary_loss_clip": 0.01192853, + "auxiliary_loss_mlp": 0.01035134, + "balance_loss_clip": 1.05667853, + "balance_loss_mlp": 1.0236001, + "epoch": 0.12457163470209824, + "flos": 46423087522560.0, + "grad_norm": 1.6752296986508524, + "language_loss": 0.73605829, + "learning_rate": 3.907010494210144e-06, + "loss": 0.75833815, + "num_input_tokens_seen": 21881920, + "step": 1036, + "time_per_iteration": 2.764355421066284 + }, + { + "auxiliary_loss_clip": 0.01230011, + "auxiliary_loss_mlp": 0.01046237, + "balance_loss_clip": 1.06778121, + "balance_loss_mlp": 1.03355885, + "epoch": 0.12469187759273732, + "flos": 20376289578240.0, + "grad_norm": 2.0552989901705367, + "language_loss": 0.92072618, + "learning_rate": 3.9067755858873495e-06, + "loss": 0.9434886, + "num_input_tokens_seen": 21898720, + "step": 1037, + "time_per_iteration": 2.509488344192505 + }, + { + "auxiliary_loss_clip": 0.01088206, + "auxiliary_loss_mlp": 0.01006039, + "balance_loss_clip": 1.01946926, + "balance_loss_mlp": 1.00302303, + "epoch": 0.12481212048337642, + "flos": 69224641447680.0, + "grad_norm": 0.8631232454559965, + "language_loss": 0.62778598, + "learning_rate": 3.906540388308214e-06, + "loss": 0.64872843, + "num_input_tokens_seen": 21958305, + "step": 1038, + "time_per_iteration": 3.111362934112549 + }, + { + "auxiliary_loss_clip": 0.01166583, + "auxiliary_loss_mlp": 0.01047939, + "balance_loss_clip": 1.05954885, + "balance_loss_mlp": 1.03619719, + "epoch": 0.12493236337401552, + "flos": 18223696350720.0, + "grad_norm": 1.7252077372252637, + "language_loss": 0.81430256, + "learning_rate": 3.906304901508417e-06, + "loss": 0.83644783, + "num_input_tokens_seen": 21977205, + "step": 1039, + "time_per_iteration": 3.4055750370025635 + }, + { + "auxiliary_loss_clip": 0.01230046, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_clip": 1.07025886, + "balance_loss_mlp": 1.03304839, + "epoch": 0.12505260626465461, + "flos": 30044375303040.0, + "grad_norm": 2.030673336104599, + "language_loss": 0.7545746, + "learning_rate": 3.9060691255236835e-06, + "loss": 0.77731097, + "num_input_tokens_seen": 21997770, + "step": 1040, + "time_per_iteration": 2.5736887454986572 + }, + { + "auxiliary_loss_clip": 0.01219124, + "auxiliary_loss_mlp": 0.01041874, + "balance_loss_clip": 1.05977941, + "balance_loss_mlp": 1.02981019, + "epoch": 0.1251728491552937, + "flos": 24433980347520.0, + "grad_norm": 1.7378624766192161, + "language_loss": 0.80589688, + "learning_rate": 3.905833060389778e-06, + "loss": 0.82850689, + "num_input_tokens_seen": 22021890, + "step": 1041, + "time_per_iteration": 4.35455584526062 + }, + { + "auxiliary_loss_clip": 0.01242913, + "auxiliary_loss_mlp": 0.00765922, + "balance_loss_clip": 1.06769443, + "balance_loss_mlp": 1.00015247, + "epoch": 0.12529309204593278, + "flos": 27119809952640.0, + "grad_norm": 2.4006404917695154, + "language_loss": 0.78460449, + "learning_rate": 3.905596706142513e-06, + "loss": 0.80469286, + "num_input_tokens_seen": 22043300, + "step": 1042, + "time_per_iteration": 2.5455574989318848 + }, + { + "auxiliary_loss_clip": 0.0119042, + "auxiliary_loss_mlp": 0.01042626, + "balance_loss_clip": 1.05873978, + "balance_loss_mlp": 1.03075874, + "epoch": 0.12541333493657186, + "flos": 30774151923840.0, + "grad_norm": 1.9265047748491917, + "language_loss": 0.85775608, + "learning_rate": 3.9053600628177435e-06, + "loss": 0.88008654, + "num_input_tokens_seen": 22062910, + "step": 1043, + "time_per_iteration": 2.6622276306152344 + }, + { + "auxiliary_loss_clip": 0.01240352, + "auxiliary_loss_mlp": 0.01037129, + "balance_loss_clip": 1.06610239, + "balance_loss_mlp": 1.02621567, + "epoch": 0.12553357782721097, + "flos": 23659566099840.0, + "grad_norm": 2.189959295462693, + "language_loss": 0.84603149, + "learning_rate": 3.905123130451367e-06, + "loss": 0.86880624, + "num_input_tokens_seen": 22084010, + "step": 1044, + "time_per_iteration": 2.6156985759735107 + }, + { + "auxiliary_loss_clip": 0.01243552, + "auxiliary_loss_mlp": 0.01038057, + "balance_loss_clip": 1.06874752, + "balance_loss_mlp": 1.02632654, + "epoch": 0.12565382071785006, + "flos": 24863758577280.0, + "grad_norm": 1.8574377599064194, + "language_loss": 0.79525554, + "learning_rate": 3.904885909079326e-06, + "loss": 0.8180716, + "num_input_tokens_seen": 22102795, + "step": 1045, + "time_per_iteration": 2.5214390754699707 + }, + { + "auxiliary_loss_clip": 0.0122688, + "auxiliary_loss_mlp": 0.01037631, + "balance_loss_clip": 1.06402993, + "balance_loss_mlp": 1.02630591, + "epoch": 0.12577406360848914, + "flos": 21360780518400.0, + "grad_norm": 2.1491868964460044, + "language_loss": 0.77576238, + "learning_rate": 3.904648398737607e-06, + "loss": 0.79840744, + "num_input_tokens_seen": 22121360, + "step": 1046, + "time_per_iteration": 2.5263524055480957 + }, + { + "auxiliary_loss_clip": 0.01241834, + "auxiliary_loss_mlp": 0.01042549, + "balance_loss_clip": 1.06716454, + "balance_loss_mlp": 1.03160596, + "epoch": 0.12589430649912825, + "flos": 36138056774400.0, + "grad_norm": 1.7367113541557497, + "language_loss": 0.78241658, + "learning_rate": 3.9044105994622406e-06, + "loss": 0.80526048, + "num_input_tokens_seen": 22142505, + "step": 1047, + "time_per_iteration": 2.6438097953796387 + }, + { + "auxiliary_loss_clip": 0.01213877, + "auxiliary_loss_mlp": 0.00766208, + "balance_loss_clip": 1.06240201, + "balance_loss_mlp": 1.00026429, + "epoch": 0.12601454938976733, + "flos": 25337671643520.0, + "grad_norm": 1.9834209488074352, + "language_loss": 0.81556469, + "learning_rate": 3.9041725112893005e-06, + "loss": 0.83536553, + "num_input_tokens_seen": 22163730, + "step": 1048, + "time_per_iteration": 2.6039910316467285 + }, + { + "auxiliary_loss_clip": 0.01191212, + "auxiliary_loss_mlp": 0.0103965, + "balance_loss_clip": 1.06269813, + "balance_loss_mlp": 1.02858782, + "epoch": 0.12613479228040642, + "flos": 15560094286080.0, + "grad_norm": 2.0108690803891704, + "language_loss": 0.74804461, + "learning_rate": 3.903934134254904e-06, + "loss": 0.77035326, + "num_input_tokens_seen": 22181520, + "step": 1049, + "time_per_iteration": 2.567673444747925 + }, + { + "auxiliary_loss_clip": 0.01230172, + "auxiliary_loss_mlp": 0.01043875, + "balance_loss_clip": 1.06453228, + "balance_loss_mlp": 1.03203142, + "epoch": 0.1262550351710455, + "flos": 21470595373440.0, + "grad_norm": 2.921216138891059, + "language_loss": 0.84975553, + "learning_rate": 3.903695468395213e-06, + "loss": 0.87249595, + "num_input_tokens_seen": 22199390, + "step": 1050, + "time_per_iteration": 2.5238101482391357 + }, + { + "auxiliary_loss_clip": 0.01213068, + "auxiliary_loss_mlp": 0.01042308, + "balance_loss_clip": 1.05948162, + "balance_loss_mlp": 1.03168654, + "epoch": 0.1263752780616846, + "flos": 31576719456000.0, + "grad_norm": 2.6318880292129165, + "language_loss": 0.55679047, + "learning_rate": 3.903456513746434e-06, + "loss": 0.57934421, + "num_input_tokens_seen": 22220365, + "step": 1051, + "time_per_iteration": 2.613668203353882 + }, + { + "auxiliary_loss_clip": 0.01238207, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.06540203, + "balance_loss_mlp": 1.02813554, + "epoch": 0.1264955209523237, + "flos": 28768217927040.0, + "grad_norm": 1.7206193974943775, + "language_loss": 0.87629473, + "learning_rate": 3.903217270344815e-06, + "loss": 0.89906412, + "num_input_tokens_seen": 22240615, + "step": 1052, + "time_per_iteration": 2.537637710571289 + }, + { + "auxiliary_loss_clip": 0.01184548, + "auxiliary_loss_mlp": 0.01037958, + "balance_loss_clip": 1.05635595, + "balance_loss_mlp": 1.02667487, + "epoch": 0.12661576384296278, + "flos": 29241125412480.0, + "grad_norm": 2.076397698030303, + "language_loss": 0.82413191, + "learning_rate": 3.902977738226648e-06, + "loss": 0.84635699, + "num_input_tokens_seen": 22261350, + "step": 1053, + "time_per_iteration": 2.6436336040496826 + }, + { + "auxiliary_loss_clip": 0.01229076, + "auxiliary_loss_mlp": 0.0104186, + "balance_loss_clip": 1.06581616, + "balance_loss_mlp": 1.02984357, + "epoch": 0.12673600673360189, + "flos": 20850346298880.0, + "grad_norm": 2.012519176240002, + "language_loss": 0.91379714, + "learning_rate": 3.902737917428273e-06, + "loss": 0.93650657, + "num_input_tokens_seen": 22279515, + "step": 1054, + "time_per_iteration": 2.490696668624878 + }, + { + "auxiliary_loss_clip": 0.01239093, + "auxiliary_loss_mlp": 0.01039842, + "balance_loss_clip": 1.06527889, + "balance_loss_mlp": 1.02880883, + "epoch": 0.12685624962424097, + "flos": 25263695583360.0, + "grad_norm": 1.6635908928488745, + "language_loss": 0.83977073, + "learning_rate": 3.902497807986068e-06, + "loss": 0.86256003, + "num_input_tokens_seen": 22299535, + "step": 1055, + "time_per_iteration": 2.502523422241211 + }, + { + "auxiliary_loss_clip": 0.01194806, + "auxiliary_loss_mlp": 0.01039005, + "balance_loss_clip": 1.05784416, + "balance_loss_mlp": 1.02744198, + "epoch": 0.12697649251488005, + "flos": 27527109246720.0, + "grad_norm": 1.5842689481459875, + "language_loss": 0.83632183, + "learning_rate": 3.902257409936458e-06, + "loss": 0.85865998, + "num_input_tokens_seen": 22320300, + "step": 1056, + "time_per_iteration": 2.641488552093506 + }, + { + "auxiliary_loss_clip": 0.01209253, + "auxiliary_loss_mlp": 0.01039576, + "balance_loss_clip": 1.06527722, + "balance_loss_mlp": 1.0288713, + "epoch": 0.12709673540551916, + "flos": 21251863503360.0, + "grad_norm": 1.8396098698391874, + "language_loss": 0.83933568, + "learning_rate": 3.902016723315912e-06, + "loss": 0.86182398, + "num_input_tokens_seen": 22338240, + "step": 1057, + "time_per_iteration": 2.5296690464019775 + }, + { + "auxiliary_loss_clip": 0.01221255, + "auxiliary_loss_mlp": 0.01040542, + "balance_loss_clip": 1.06176949, + "balance_loss_mlp": 1.02964067, + "epoch": 0.12721697829615825, + "flos": 25337707557120.0, + "grad_norm": 7.034821121062021, + "language_loss": 0.69578528, + "learning_rate": 3.901775748160941e-06, + "loss": 0.71840322, + "num_input_tokens_seen": 22357420, + "step": 1058, + "time_per_iteration": 2.536893606185913 + }, + { + "auxiliary_loss_clip": 0.01100003, + "auxiliary_loss_mlp": 0.01008418, + "balance_loss_clip": 1.02518487, + "balance_loss_mlp": 1.00559258, + "epoch": 0.12733722118679733, + "flos": 61943287754880.0, + "grad_norm": 0.7970972117639246, + "language_loss": 0.60888958, + "learning_rate": 3.901534484508101e-06, + "loss": 0.62997377, + "num_input_tokens_seen": 22420095, + "step": 1059, + "time_per_iteration": 3.098818302154541 + }, + { + "auxiliary_loss_clip": 0.01200318, + "auxiliary_loss_mlp": 0.01037541, + "balance_loss_clip": 1.0596056, + "balance_loss_mlp": 1.0263294, + "epoch": 0.1274574640774364, + "flos": 26976742081920.0, + "grad_norm": 2.7969127552917716, + "language_loss": 0.74671823, + "learning_rate": 3.901292932393991e-06, + "loss": 0.76909685, + "num_input_tokens_seen": 22438975, + "step": 1060, + "time_per_iteration": 2.5649211406707764 + }, + { + "auxiliary_loss_clip": 0.0123987, + "auxiliary_loss_mlp": 0.01042676, + "balance_loss_clip": 1.06724727, + "balance_loss_mlp": 1.03141642, + "epoch": 0.12757770696807552, + "flos": 22236318529920.0, + "grad_norm": 2.818224049382878, + "language_loss": 0.85229158, + "learning_rate": 3.9010510918552555e-06, + "loss": 0.87511706, + "num_input_tokens_seen": 22458050, + "step": 1061, + "time_per_iteration": 3.32651424407959 + }, + { + "auxiliary_loss_clip": 0.01206446, + "auxiliary_loss_mlp": 0.01045209, + "balance_loss_clip": 1.06109846, + "balance_loss_mlp": 1.03220916, + "epoch": 0.1276979498587146, + "flos": 28547905858560.0, + "grad_norm": 3.845495566493531, + "language_loss": 0.74482942, + "learning_rate": 3.900808962928581e-06, + "loss": 0.76734602, + "num_input_tokens_seen": 22475665, + "step": 1062, + "time_per_iteration": 2.6039249897003174 + }, + { + "auxiliary_loss_clip": 0.01241533, + "auxiliary_loss_mlp": 0.01043355, + "balance_loss_clip": 1.069332, + "balance_loss_mlp": 1.03210807, + "epoch": 0.1278181927493537, + "flos": 17420338719360.0, + "grad_norm": 2.3556815949442447, + "language_loss": 0.89496851, + "learning_rate": 3.900566545650698e-06, + "loss": 0.91781747, + "num_input_tokens_seen": 22493335, + "step": 1063, + "time_per_iteration": 2.5018365383148193 + }, + { + "auxiliary_loss_clip": 0.01224384, + "auxiliary_loss_mlp": 0.01038408, + "balance_loss_clip": 1.06568766, + "balance_loss_mlp": 1.02628446, + "epoch": 0.1279384356399928, + "flos": 21138636856320.0, + "grad_norm": 2.4870396700110913, + "language_loss": 0.81683242, + "learning_rate": 3.900323840058381e-06, + "loss": 0.83946037, + "num_input_tokens_seen": 22511045, + "step": 1064, + "time_per_iteration": 2.5124664306640625 + }, + { + "auxiliary_loss_clip": 0.01222741, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.06175506, + "balance_loss_mlp": 1.02920628, + "epoch": 0.12805867853063188, + "flos": 26576733248640.0, + "grad_norm": 3.1755074166154538, + "language_loss": 0.81772017, + "learning_rate": 3.900080846188449e-06, + "loss": 0.84034383, + "num_input_tokens_seen": 22529635, + "step": 1065, + "time_per_iteration": 2.5471057891845703 + }, + { + "auxiliary_loss_clip": 0.01238892, + "auxiliary_loss_mlp": 0.01036302, + "balance_loss_clip": 1.06594896, + "balance_loss_mlp": 1.02464962, + "epoch": 0.12817892142127096, + "flos": 16436206915200.0, + "grad_norm": 2.080051520696307, + "language_loss": 0.8166948, + "learning_rate": 3.8998375640777625e-06, + "loss": 0.83944678, + "num_input_tokens_seen": 22547505, + "step": 1066, + "time_per_iteration": 3.271850347518921 + }, + { + "auxiliary_loss_clip": 0.01106057, + "auxiliary_loss_mlp": 0.01002177, + "balance_loss_clip": 1.03350782, + "balance_loss_mlp": 0.99933952, + "epoch": 0.12829916431191005, + "flos": 60757049099520.0, + "grad_norm": 0.7070557646598391, + "language_loss": 0.52648526, + "learning_rate": 3.899593993763229e-06, + "loss": 0.54756761, + "num_input_tokens_seen": 22608465, + "step": 1067, + "time_per_iteration": 4.689392805099487 + }, + { + "auxiliary_loss_clip": 0.01189222, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.06122744, + "balance_loss_mlp": 1.02853906, + "epoch": 0.12841940720254916, + "flos": 29786895636480.0, + "grad_norm": 4.176731565411882, + "language_loss": 0.81389064, + "learning_rate": 3.899350135281796e-06, + "loss": 0.83620095, + "num_input_tokens_seen": 22629465, + "step": 1068, + "time_per_iteration": 2.6616933345794678 + }, + { + "auxiliary_loss_clip": 0.01196689, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.06291044, + "balance_loss_mlp": 1.02799249, + "epoch": 0.12853965009318824, + "flos": 25951851319680.0, + "grad_norm": 2.0782611993186997, + "language_loss": 0.7963084, + "learning_rate": 3.8991059886704585e-06, + "loss": 0.81866372, + "num_input_tokens_seen": 22648970, + "step": 1069, + "time_per_iteration": 2.618229866027832 + }, + { + "auxiliary_loss_clip": 0.01186604, + "auxiliary_loss_mlp": 0.01045415, + "balance_loss_clip": 1.0594517, + "balance_loss_mlp": 1.03403068, + "epoch": 0.12865989298382732, + "flos": 30846871008000.0, + "grad_norm": 2.1171472038622725, + "language_loss": 0.8299666, + "learning_rate": 3.898861553966252e-06, + "loss": 0.85228682, + "num_input_tokens_seen": 22668620, + "step": 1070, + "time_per_iteration": 2.625675916671753 + }, + { + "auxiliary_loss_clip": 0.01150602, + "auxiliary_loss_mlp": 0.0104496, + "balance_loss_clip": 1.05454755, + "balance_loss_mlp": 1.03380799, + "epoch": 0.12878013587446643, + "flos": 25885776251520.0, + "grad_norm": 1.767287465185013, + "language_loss": 0.88117641, + "learning_rate": 3.898616831206257e-06, + "loss": 0.90313202, + "num_input_tokens_seen": 22689045, + "step": 1071, + "time_per_iteration": 2.7152910232543945 + }, + { + "auxiliary_loss_clip": 0.01190213, + "auxiliary_loss_mlp": 0.01039473, + "balance_loss_clip": 1.05717158, + "balance_loss_mlp": 1.025841, + "epoch": 0.12890037876510552, + "flos": 23333138277120.0, + "grad_norm": 2.7753628309298186, + "language_loss": 0.76919007, + "learning_rate": 3.8983718204276e-06, + "loss": 0.79148692, + "num_input_tokens_seen": 22711265, + "step": 1072, + "time_per_iteration": 2.6214137077331543 + }, + { + "auxiliary_loss_clip": 0.01205405, + "auxiliary_loss_mlp": 0.0104505, + "balance_loss_clip": 1.06063318, + "balance_loss_mlp": 1.03459585, + "epoch": 0.1290206216557446, + "flos": 23587242065280.0, + "grad_norm": 1.801344787788328, + "language_loss": 0.82660186, + "learning_rate": 3.898126521667446e-06, + "loss": 0.84910643, + "num_input_tokens_seen": 22731420, + "step": 1073, + "time_per_iteration": 2.5689280033111572 + }, + { + "auxiliary_loss_clip": 0.01220513, + "auxiliary_loss_mlp": 0.01048484, + "balance_loss_clip": 1.06045818, + "balance_loss_mlp": 1.03642035, + "epoch": 0.12914086454638368, + "flos": 24170610850560.0, + "grad_norm": 1.6126434237526102, + "language_loss": 0.83289838, + "learning_rate": 3.897880934963007e-06, + "loss": 0.85558832, + "num_input_tokens_seen": 22750970, + "step": 1074, + "time_per_iteration": 2.523643732070923 + }, + { + "auxiliary_loss_clip": 0.01203398, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.05829477, + "balance_loss_mlp": 1.02604461, + "epoch": 0.1292611074370228, + "flos": 20267157081600.0, + "grad_norm": 2.45808645228896, + "language_loss": 0.7873584, + "learning_rate": 3.89763506035154e-06, + "loss": 0.80977255, + "num_input_tokens_seen": 22768820, + "step": 1075, + "time_per_iteration": 2.5363309383392334 + }, + { + "auxiliary_loss_clip": 0.01210482, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.06172299, + "balance_loss_mlp": 1.02454185, + "epoch": 0.12938135032766188, + "flos": 27377684668800.0, + "grad_norm": 3.0214997472114447, + "language_loss": 0.80906409, + "learning_rate": 3.897388897870343e-06, + "loss": 0.83152467, + "num_input_tokens_seen": 22789460, + "step": 1076, + "time_per_iteration": 2.5580832958221436 + }, + { + "auxiliary_loss_clip": 0.01220461, + "auxiliary_loss_mlp": 0.01036571, + "balance_loss_clip": 1.06149411, + "balance_loss_mlp": 1.02417898, + "epoch": 0.12950159321830096, + "flos": 29277107861760.0, + "grad_norm": 1.9033456538678497, + "language_loss": 0.74950051, + "learning_rate": 3.89714244755676e-06, + "loss": 0.77207083, + "num_input_tokens_seen": 22810820, + "step": 1077, + "time_per_iteration": 2.6054811477661133 + }, + { + "auxiliary_loss_clip": 0.01163658, + "auxiliary_loss_mlp": 0.0104029, + "balance_loss_clip": 1.05325067, + "balance_loss_mlp": 1.02897716, + "epoch": 0.12962183610894007, + "flos": 24534888629760.0, + "grad_norm": 2.5521679309733747, + "language_loss": 0.86465424, + "learning_rate": 3.896895709448175e-06, + "loss": 0.88669372, + "num_input_tokens_seen": 22830570, + "step": 1078, + "time_per_iteration": 2.6051576137542725 + }, + { + "auxiliary_loss_clip": 0.01154197, + "auxiliary_loss_mlp": 0.01041021, + "balance_loss_clip": 1.05152571, + "balance_loss_mlp": 1.02999401, + "epoch": 0.12974207899957915, + "flos": 11215944552960.0, + "grad_norm": 2.658614775245721, + "language_loss": 0.76757759, + "learning_rate": 3.896648683582019e-06, + "loss": 0.78952974, + "num_input_tokens_seen": 22845905, + "step": 1079, + "time_per_iteration": 2.603343963623047 + }, + { + "auxiliary_loss_clip": 0.01175617, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.06057978, + "balance_loss_mlp": 1.02589798, + "epoch": 0.12986232189021824, + "flos": 24717889445760.0, + "grad_norm": 2.1070772137520026, + "language_loss": 0.80655158, + "learning_rate": 3.896401369995766e-06, + "loss": 0.82867491, + "num_input_tokens_seen": 22865710, + "step": 1080, + "time_per_iteration": 2.643333673477173 + }, + { + "auxiliary_loss_clip": 0.01241174, + "auxiliary_loss_mlp": 0.0104661, + "balance_loss_clip": 1.06868219, + "balance_loss_mlp": 1.03552365, + "epoch": 0.12998256478085732, + "flos": 23915357827200.0, + "grad_norm": 1.8554135263392066, + "language_loss": 0.79311562, + "learning_rate": 3.896153768726932e-06, + "loss": 0.81599343, + "num_input_tokens_seen": 22886020, + "step": 1081, + "time_per_iteration": 2.5395965576171875 + }, + { + "auxiliary_loss_clip": 0.0122403, + "auxiliary_loss_mlp": 0.01040681, + "balance_loss_clip": 1.06594539, + "balance_loss_mlp": 1.02932608, + "epoch": 0.13010280767149643, + "flos": 18624207974400.0, + "grad_norm": 1.9891574188386492, + "language_loss": 0.88152111, + "learning_rate": 3.8959058798130806e-06, + "loss": 0.90416819, + "num_input_tokens_seen": 22903995, + "step": 1082, + "time_per_iteration": 2.474423408508301 + }, + { + "auxiliary_loss_clip": 0.01211475, + "auxiliary_loss_mlp": 0.00766485, + "balance_loss_clip": 1.06288767, + "balance_loss_mlp": 1.00034475, + "epoch": 0.1302230505621355, + "flos": 22783992174720.0, + "grad_norm": 1.93527029883986, + "language_loss": 0.74766552, + "learning_rate": 3.895657703291814e-06, + "loss": 0.76744515, + "num_input_tokens_seen": 22924100, + "step": 1083, + "time_per_iteration": 2.55025053024292 + }, + { + "auxiliary_loss_clip": 0.01216616, + "auxiliary_loss_mlp": 0.01035848, + "balance_loss_clip": 1.06058097, + "balance_loss_mlp": 1.02440393, + "epoch": 0.1303432934527746, + "flos": 21323612920320.0, + "grad_norm": 2.8579568162587927, + "language_loss": 0.7955972, + "learning_rate": 3.895409239200781e-06, + "loss": 0.81812179, + "num_input_tokens_seen": 22939985, + "step": 1084, + "time_per_iteration": 2.5215818881988525 + }, + { + "auxiliary_loss_clip": 0.0121806, + "auxiliary_loss_mlp": 0.01041378, + "balance_loss_clip": 1.0621314, + "balance_loss_mlp": 1.02850354, + "epoch": 0.1304635363434137, + "flos": 20922490765440.0, + "grad_norm": 2.390041997093012, + "language_loss": 0.91477942, + "learning_rate": 3.895160487577673e-06, + "loss": 0.93737376, + "num_input_tokens_seen": 22957555, + "step": 1085, + "time_per_iteration": 2.496581554412842 + }, + { + "auxiliary_loss_clip": 0.01114649, + "auxiliary_loss_mlp": 0.01003939, + "balance_loss_clip": 1.02569234, + "balance_loss_mlp": 1.00112534, + "epoch": 0.1305837792340528, + "flos": 63245659080960.0, + "grad_norm": 0.7858939093723247, + "language_loss": 0.60890037, + "learning_rate": 3.894911448460226e-06, + "loss": 0.63008624, + "num_input_tokens_seen": 23016870, + "step": 1086, + "time_per_iteration": 2.9670746326446533 + }, + { + "auxiliary_loss_clip": 0.01126529, + "auxiliary_loss_mlp": 0.01046833, + "balance_loss_clip": 1.05105567, + "balance_loss_mlp": 1.03490007, + "epoch": 0.13070402212469187, + "flos": 26428852955520.0, + "grad_norm": 1.9298341026671, + "language_loss": 0.72878098, + "learning_rate": 3.8946621218862195e-06, + "loss": 0.75051457, + "num_input_tokens_seen": 23037870, + "step": 1087, + "time_per_iteration": 3.5803439617156982 + }, + { + "auxiliary_loss_clip": 0.01189059, + "auxiliary_loss_mlp": 0.01043075, + "balance_loss_clip": 1.06005859, + "balance_loss_mlp": 1.03217936, + "epoch": 0.13082426501533098, + "flos": 27673409341440.0, + "grad_norm": 3.9347373992496233, + "language_loss": 0.89026272, + "learning_rate": 3.894412507893475e-06, + "loss": 0.91258407, + "num_input_tokens_seen": 23058150, + "step": 1088, + "time_per_iteration": 2.6474409103393555 + }, + { + "auxiliary_loss_clip": 0.01184951, + "auxiliary_loss_mlp": 0.0104793, + "balance_loss_clip": 1.05821359, + "balance_loss_mlp": 1.03605092, + "epoch": 0.13094450790597006, + "flos": 24826770547200.0, + "grad_norm": 2.040015778418802, + "language_loss": 0.71958792, + "learning_rate": 3.894162606519859e-06, + "loss": 0.74191678, + "num_input_tokens_seen": 23077100, + "step": 1089, + "time_per_iteration": 2.671130418777466 + }, + { + "auxiliary_loss_clip": 0.01176615, + "auxiliary_loss_mlp": 0.01041977, + "balance_loss_clip": 1.05913639, + "balance_loss_mlp": 1.03121877, + "epoch": 0.13106475079660915, + "flos": 19062605468160.0, + "grad_norm": 1.9492241161256092, + "language_loss": 0.76893365, + "learning_rate": 3.893912417803282e-06, + "loss": 0.79111958, + "num_input_tokens_seen": 23096815, + "step": 1090, + "time_per_iteration": 2.6335387229919434 + }, + { + "auxiliary_loss_clip": 0.01178566, + "auxiliary_loss_mlp": 0.01042553, + "balance_loss_clip": 1.05422139, + "balance_loss_mlp": 1.03019667, + "epoch": 0.13118499368724823, + "flos": 28913189218560.0, + "grad_norm": 2.079338919257438, + "language_loss": 0.77189744, + "learning_rate": 3.8936619417816975e-06, + "loss": 0.79410869, + "num_input_tokens_seen": 23117145, + "step": 1091, + "time_per_iteration": 2.689709424972534 + }, + { + "auxiliary_loss_clip": 0.01192748, + "auxiliary_loss_mlp": 0.01033427, + "balance_loss_clip": 1.06201732, + "balance_loss_mlp": 1.02257299, + "epoch": 0.13130523657788734, + "flos": 14283398206080.0, + "grad_norm": 1.7588624226971892, + "language_loss": 0.71889734, + "learning_rate": 3.8934111784931015e-06, + "loss": 0.74115902, + "num_input_tokens_seen": 23134595, + "step": 1092, + "time_per_iteration": 2.6658742427825928 + }, + { + "auxiliary_loss_clip": 0.01104425, + "auxiliary_loss_mlp": 0.01002692, + "balance_loss_clip": 1.0229907, + "balance_loss_mlp": 0.99985522, + "epoch": 0.13142547946852642, + "flos": 70174155519360.0, + "grad_norm": 0.9243659688392305, + "language_loss": 0.59138483, + "learning_rate": 3.893160127975535e-06, + "loss": 0.61245602, + "num_input_tokens_seen": 23195285, + "step": 1093, + "time_per_iteration": 3.997832775115967 + }, + { + "auxiliary_loss_clip": 0.01180875, + "auxiliary_loss_mlp": 0.01037954, + "balance_loss_clip": 1.05656636, + "balance_loss_mlp": 1.02686143, + "epoch": 0.1315457223591655, + "flos": 45805998844800.0, + "grad_norm": 2.248025414806439, + "language_loss": 0.81055689, + "learning_rate": 3.8929087902670826e-06, + "loss": 0.83274513, + "num_input_tokens_seen": 23216915, + "step": 1094, + "time_per_iteration": 4.473284959793091 + }, + { + "auxiliary_loss_clip": 0.01115156, + "auxiliary_loss_mlp": 0.01002481, + "balance_loss_clip": 1.02234173, + "balance_loss_mlp": 0.9998461, + "epoch": 0.13166596524980462, + "flos": 62881165820160.0, + "grad_norm": 0.9316042087994315, + "language_loss": 0.60643041, + "learning_rate": 3.8926571654058715e-06, + "loss": 0.62760675, + "num_input_tokens_seen": 23273560, + "step": 1095, + "time_per_iteration": 3.001385450363159 + }, + { + "auxiliary_loss_clip": 0.01189067, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.06040812, + "balance_loss_mlp": 1.0253458, + "epoch": 0.1317862081404437, + "flos": 23586523793280.0, + "grad_norm": 2.3690148783628695, + "language_loss": 0.77153385, + "learning_rate": 3.892405253430074e-06, + "loss": 0.79378986, + "num_input_tokens_seen": 23291080, + "step": 1096, + "time_per_iteration": 2.585012435913086 + }, + { + "auxiliary_loss_clip": 0.01211897, + "auxiliary_loss_mlp": 0.00766704, + "balance_loss_clip": 1.06427073, + "balance_loss_mlp": 1.00037026, + "epoch": 0.13190645103108278, + "flos": 20260764460800.0, + "grad_norm": 2.6380488360395065, + "language_loss": 0.82375747, + "learning_rate": 3.892153054377904e-06, + "loss": 0.84354347, + "num_input_tokens_seen": 23308485, + "step": 1097, + "time_per_iteration": 2.505908250808716 + }, + { + "auxiliary_loss_clip": 0.01053572, + "auxiliary_loss_mlp": 0.01006238, + "balance_loss_clip": 1.02045012, + "balance_loss_mlp": 1.00341308, + "epoch": 0.13202669392172187, + "flos": 53455440136320.0, + "grad_norm": 0.9438268029637948, + "language_loss": 0.59427595, + "learning_rate": 3.891900568287619e-06, + "loss": 0.61487406, + "num_input_tokens_seen": 23360870, + "step": 1098, + "time_per_iteration": 3.0244534015655518 + }, + { + "auxiliary_loss_clip": 0.01196647, + "auxiliary_loss_mlp": 0.01037809, + "balance_loss_clip": 1.06021881, + "balance_loss_mlp": 1.02593517, + "epoch": 0.13214693681236098, + "flos": 15851293845120.0, + "grad_norm": 2.7464925446688007, + "language_loss": 0.72030115, + "learning_rate": 3.891647795197523e-06, + "loss": 0.74264568, + "num_input_tokens_seen": 23376910, + "step": 1099, + "time_per_iteration": 2.5363311767578125 + }, + { + "auxiliary_loss_clip": 0.01199107, + "auxiliary_loss_mlp": 0.01046583, + "balance_loss_clip": 1.05809522, + "balance_loss_mlp": 1.03436375, + "epoch": 0.13226717970300006, + "flos": 19353840940800.0, + "grad_norm": 1.9702353367692258, + "language_loss": 0.68540627, + "learning_rate": 3.8913947351459605e-06, + "loss": 0.70786315, + "num_input_tokens_seen": 23394450, + "step": 1100, + "time_per_iteration": 2.570089817047119 + }, + { + "auxiliary_loss_clip": 0.01242258, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.06853461, + "balance_loss_mlp": 1.02806497, + "epoch": 0.13238742259363914, + "flos": 20698084546560.0, + "grad_norm": 1.992411835738522, + "language_loss": 0.67721313, + "learning_rate": 3.89114138817132e-06, + "loss": 0.70002329, + "num_input_tokens_seen": 23411115, + "step": 1101, + "time_per_iteration": 2.4617741107940674 + }, + { + "auxiliary_loss_clip": 0.0122472, + "auxiliary_loss_mlp": 0.01033473, + "balance_loss_clip": 1.06689572, + "balance_loss_mlp": 1.02222586, + "epoch": 0.13250766548427825, + "flos": 21032449274880.0, + "grad_norm": 1.7244034211692043, + "language_loss": 0.84244347, + "learning_rate": 3.890887754312035e-06, + "loss": 0.8650254, + "num_input_tokens_seen": 23429360, + "step": 1102, + "time_per_iteration": 2.5234038829803467 + }, + { + "auxiliary_loss_clip": 0.01198335, + "auxiliary_loss_mlp": 0.01047433, + "balance_loss_clip": 1.05613685, + "balance_loss_mlp": 1.03599524, + "epoch": 0.13262790837491734, + "flos": 22637871648000.0, + "grad_norm": 1.8425585182217774, + "language_loss": 0.87634182, + "learning_rate": 3.890633833606581e-06, + "loss": 0.89879954, + "num_input_tokens_seen": 23449050, + "step": 1103, + "time_per_iteration": 2.519550323486328 + }, + { + "auxiliary_loss_clip": 0.01223589, + "auxiliary_loss_mlp": 0.01033289, + "balance_loss_clip": 1.06723905, + "balance_loss_mlp": 1.0228045, + "epoch": 0.13274815126555642, + "flos": 19683141851520.0, + "grad_norm": 1.8864708105263763, + "language_loss": 0.69656742, + "learning_rate": 3.890379626093477e-06, + "loss": 0.71913624, + "num_input_tokens_seen": 23468800, + "step": 1104, + "time_per_iteration": 2.5130162239074707 + }, + { + "auxiliary_loss_clip": 0.01164821, + "auxiliary_loss_mlp": 0.01037858, + "balance_loss_clip": 1.0563271, + "balance_loss_mlp": 1.02599669, + "epoch": 0.1328683941561955, + "flos": 21317687176320.0, + "grad_norm": 3.0201325999247755, + "language_loss": 0.92674553, + "learning_rate": 3.890125131811287e-06, + "loss": 0.94877231, + "num_input_tokens_seen": 23486850, + "step": 1105, + "time_per_iteration": 2.5744683742523193 + }, + { + "auxiliary_loss_clip": 0.01193684, + "auxiliary_loss_mlp": 0.01037857, + "balance_loss_clip": 1.055655, + "balance_loss_mlp": 1.02708602, + "epoch": 0.1329886370468346, + "flos": 13699131580800.0, + "grad_norm": 2.0178759255710434, + "language_loss": 0.75340331, + "learning_rate": 3.889870350798618e-06, + "loss": 0.77571869, + "num_input_tokens_seen": 23504195, + "step": 1106, + "time_per_iteration": 2.5424411296844482 + }, + { + "auxiliary_loss_clip": 0.01241392, + "auxiliary_loss_mlp": 0.01038581, + "balance_loss_clip": 1.06606436, + "balance_loss_mlp": 1.02779841, + "epoch": 0.1331088799374737, + "flos": 21032413361280.0, + "grad_norm": 1.6954560955448363, + "language_loss": 0.78544891, + "learning_rate": 3.889615283094119e-06, + "loss": 0.80824864, + "num_input_tokens_seen": 23523385, + "step": 1107, + "time_per_iteration": 2.4959022998809814 + }, + { + "auxiliary_loss_clip": 0.01244503, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.06660509, + "balance_loss_mlp": 1.02921891, + "epoch": 0.13322912282811278, + "flos": 18260432985600.0, + "grad_norm": 2.1217730320413266, + "language_loss": 0.84691358, + "learning_rate": 3.889359928736485e-06, + "loss": 0.86977088, + "num_input_tokens_seen": 23541330, + "step": 1108, + "time_per_iteration": 2.5023608207702637 + }, + { + "auxiliary_loss_clip": 0.01201133, + "auxiliary_loss_mlp": 0.00766386, + "balance_loss_clip": 1.06272173, + "balance_loss_mlp": 1.00047123, + "epoch": 0.1333493657187519, + "flos": 24460876656000.0, + "grad_norm": 2.030399835573967, + "language_loss": 0.90990919, + "learning_rate": 3.889104287764451e-06, + "loss": 0.92958438, + "num_input_tokens_seen": 23561705, + "step": 1109, + "time_per_iteration": 2.5452005863189697 + }, + { + "auxiliary_loss_clip": 0.01208617, + "auxiliary_loss_mlp": 0.01042573, + "balance_loss_clip": 1.0652591, + "balance_loss_mlp": 1.03161192, + "epoch": 0.13346960860939097, + "flos": 22158930677760.0, + "grad_norm": 2.0471571648656246, + "language_loss": 0.90363342, + "learning_rate": 3.888848360216798e-06, + "loss": 0.92614532, + "num_input_tokens_seen": 23579350, + "step": 1110, + "time_per_iteration": 2.536431312561035 + }, + { + "auxiliary_loss_clip": 0.01103863, + "auxiliary_loss_mlp": 0.01004289, + "balance_loss_clip": 1.02094758, + "balance_loss_mlp": 1.00163019, + "epoch": 0.13358985150003005, + "flos": 67931212608000.0, + "grad_norm": 0.7998990758872707, + "language_loss": 0.5653863, + "learning_rate": 3.888592146132351e-06, + "loss": 0.58646786, + "num_input_tokens_seen": 23640620, + "step": 1111, + "time_per_iteration": 3.199981689453125 + }, + { + "auxiliary_loss_clip": 0.01224742, + "auxiliary_loss_mlp": 0.01043393, + "balance_loss_clip": 1.06680393, + "balance_loss_mlp": 1.03215134, + "epoch": 0.13371009439066917, + "flos": 26834284742400.0, + "grad_norm": 2.09199863814541, + "language_loss": 0.78491414, + "learning_rate": 3.888335645549978e-06, + "loss": 0.80759543, + "num_input_tokens_seen": 23661040, + "step": 1112, + "time_per_iteration": 2.5374910831451416 + }, + { + "auxiliary_loss_clip": 0.01242404, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_clip": 1.07057321, + "balance_loss_mlp": 1.03442645, + "epoch": 0.13383033728130825, + "flos": 26322844942080.0, + "grad_norm": 2.8271973731830724, + "language_loss": 0.81292695, + "learning_rate": 3.888078858508588e-06, + "loss": 0.83580691, + "num_input_tokens_seen": 23680900, + "step": 1113, + "time_per_iteration": 2.512892246246338 + }, + { + "auxiliary_loss_clip": 0.01209126, + "auxiliary_loss_mlp": 0.01037102, + "balance_loss_clip": 1.06670332, + "balance_loss_mlp": 1.02578866, + "epoch": 0.13395058017194733, + "flos": 22563931501440.0, + "grad_norm": 1.9607594972561997, + "language_loss": 0.84491694, + "learning_rate": 3.8878217850471365e-06, + "loss": 0.86737925, + "num_input_tokens_seen": 23700815, + "step": 1114, + "time_per_iteration": 3.590632200241089 + }, + { + "auxiliary_loss_clip": 0.01244709, + "auxiliary_loss_mlp": 0.0104511, + "balance_loss_clip": 1.07039833, + "balance_loss_mlp": 1.03270042, + "epoch": 0.13407082306258641, + "flos": 25810938264960.0, + "grad_norm": 1.7815688823615061, + "language_loss": 0.74153411, + "learning_rate": 3.887564425204621e-06, + "loss": 0.76443231, + "num_input_tokens_seen": 23722500, + "step": 1115, + "time_per_iteration": 2.5157196521759033 + }, + { + "auxiliary_loss_clip": 0.01082572, + "auxiliary_loss_mlp": 0.01002906, + "balance_loss_clip": 1.02276957, + "balance_loss_mlp": 1.00024784, + "epoch": 0.13419106595322552, + "flos": 68338365269760.0, + "grad_norm": 0.8454408615764806, + "language_loss": 0.54641867, + "learning_rate": 3.887306779020083e-06, + "loss": 0.56727344, + "num_input_tokens_seen": 23777155, + "step": 1116, + "time_per_iteration": 3.055781602859497 + }, + { + "auxiliary_loss_clip": 0.01229169, + "auxiliary_loss_mlp": 0.01043548, + "balance_loss_clip": 1.06806564, + "balance_loss_mlp": 1.03185344, + "epoch": 0.1343113088438646, + "flos": 20449080489600.0, + "grad_norm": 2.609570743827039, + "language_loss": 0.7029599, + "learning_rate": 3.887048846532608e-06, + "loss": 0.72568709, + "num_input_tokens_seen": 23794130, + "step": 1117, + "time_per_iteration": 2.49936842918396 + }, + { + "auxiliary_loss_clip": 0.0108691, + "auxiliary_loss_mlp": 0.01003492, + "balance_loss_clip": 1.01915431, + "balance_loss_mlp": 1.00084519, + "epoch": 0.1344315517345037, + "flos": 67389784951680.0, + "grad_norm": 0.7589417508545027, + "language_loss": 0.58111024, + "learning_rate": 3.8867906277813224e-06, + "loss": 0.6020143, + "num_input_tokens_seen": 23852285, + "step": 1118, + "time_per_iteration": 3.0013554096221924 + }, + { + "auxiliary_loss_clip": 0.01226343, + "auxiliary_loss_mlp": 0.00766284, + "balance_loss_clip": 1.06450415, + "balance_loss_mlp": 1.00029206, + "epoch": 0.1345517946251428, + "flos": 40734442788480.0, + "grad_norm": 3.41867014912935, + "language_loss": 0.74377131, + "learning_rate": 3.886532122805399e-06, + "loss": 0.76369756, + "num_input_tokens_seen": 23874765, + "step": 1119, + "time_per_iteration": 3.567920207977295 + }, + { + "auxiliary_loss_clip": 0.0114868, + "auxiliary_loss_mlp": 0.01045096, + "balance_loss_clip": 1.05268717, + "balance_loss_mlp": 1.03314567, + "epoch": 0.13467203751578188, + "flos": 22816850140800.0, + "grad_norm": 1.6925202653526292, + "language_loss": 0.89782441, + "learning_rate": 3.886273331644053e-06, + "loss": 0.91976219, + "num_input_tokens_seen": 23893635, + "step": 1120, + "time_per_iteration": 3.566516637802124 + }, + { + "auxiliary_loss_clip": 0.01174579, + "auxiliary_loss_mlp": 0.01035851, + "balance_loss_clip": 1.06019986, + "balance_loss_mlp": 1.02468705, + "epoch": 0.13479228040642097, + "flos": 17091576512640.0, + "grad_norm": 2.018650031964999, + "language_loss": 0.82421106, + "learning_rate": 3.886014254336542e-06, + "loss": 0.84631538, + "num_input_tokens_seen": 23910110, + "step": 1121, + "time_per_iteration": 2.569540500640869 + }, + { + "auxiliary_loss_clip": 0.0122203, + "auxiliary_loss_mlp": 0.01035701, + "balance_loss_clip": 1.06409669, + "balance_loss_mlp": 1.02470994, + "epoch": 0.13491252329706005, + "flos": 23730525417600.0, + "grad_norm": 1.810207558557642, + "language_loss": 0.92445028, + "learning_rate": 3.885754890922168e-06, + "loss": 0.94702762, + "num_input_tokens_seen": 23930440, + "step": 1122, + "time_per_iteration": 2.5304903984069824 + }, + { + "auxiliary_loss_clip": 0.01131994, + "auxiliary_loss_mlp": 0.01047555, + "balance_loss_clip": 1.05270624, + "balance_loss_mlp": 1.03595567, + "epoch": 0.13503276618769916, + "flos": 34127058960000.0, + "grad_norm": 1.9392186077251008, + "language_loss": 0.78558898, + "learning_rate": 3.885495241440277e-06, + "loss": 0.80738449, + "num_input_tokens_seen": 23954535, + "step": 1123, + "time_per_iteration": 2.7606773376464844 + }, + { + "auxiliary_loss_clip": 0.01242893, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.06803179, + "balance_loss_mlp": 1.03191471, + "epoch": 0.13515300907833824, + "flos": 17712328377600.0, + "grad_norm": 1.8355905977515146, + "language_loss": 0.74198043, + "learning_rate": 3.885235305930257e-06, + "loss": 0.7648412, + "num_input_tokens_seen": 23972735, + "step": 1124, + "time_per_iteration": 2.4871087074279785 + }, + { + "auxiliary_loss_clip": 0.01190885, + "auxiliary_loss_mlp": 0.01048269, + "balance_loss_clip": 1.06436086, + "balance_loss_mlp": 1.0352211, + "epoch": 0.13527325196897733, + "flos": 20260872201600.0, + "grad_norm": 1.847931973002981, + "language_loss": 0.85501164, + "learning_rate": 3.884975084431539e-06, + "loss": 0.87740314, + "num_input_tokens_seen": 23987685, + "step": 1125, + "time_per_iteration": 2.5507395267486572 + }, + { + "auxiliary_loss_clip": 0.01215421, + "auxiliary_loss_mlp": 0.00766433, + "balance_loss_clip": 1.06335557, + "balance_loss_mlp": 1.00042605, + "epoch": 0.13539349485961644, + "flos": 18186492839040.0, + "grad_norm": 2.2416136256575228, + "language_loss": 0.91655236, + "learning_rate": 3.8847145769836e-06, + "loss": 0.93637091, + "num_input_tokens_seen": 24004105, + "step": 1126, + "time_per_iteration": 2.515225648880005 + }, + { + "auxiliary_loss_clip": 0.01242819, + "auxiliary_loss_mlp": 0.01041202, + "balance_loss_clip": 1.06684661, + "balance_loss_mlp": 1.02956116, + "epoch": 0.13551373775025552, + "flos": 19317463441920.0, + "grad_norm": 2.386962710680505, + "language_loss": 0.66415197, + "learning_rate": 3.884453783625959e-06, + "loss": 0.68699217, + "num_input_tokens_seen": 24021715, + "step": 1127, + "time_per_iteration": 2.482673168182373 + }, + { + "auxiliary_loss_clip": 0.01203213, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.06316209, + "balance_loss_mlp": 1.02518249, + "epoch": 0.1356339806408946, + "flos": 20850813175680.0, + "grad_norm": 2.1108926415072697, + "language_loss": 0.84781194, + "learning_rate": 3.884192704398176e-06, + "loss": 0.87020141, + "num_input_tokens_seen": 24038915, + "step": 1128, + "time_per_iteration": 2.531822443008423 + }, + { + "auxiliary_loss_clip": 0.012242, + "auxiliary_loss_mlp": 0.01051782, + "balance_loss_clip": 1.06316972, + "balance_loss_mlp": 1.04063559, + "epoch": 0.13575422353153369, + "flos": 50476037696640.0, + "grad_norm": 1.6739670975124537, + "language_loss": 0.74570715, + "learning_rate": 3.883931339339858e-06, + "loss": 0.76846701, + "num_input_tokens_seen": 24063300, + "step": 1129, + "time_per_iteration": 2.761939287185669 + }, + { + "auxiliary_loss_clip": 0.01227772, + "auxiliary_loss_mlp": 0.01039168, + "balance_loss_clip": 1.06450844, + "balance_loss_mlp": 1.02742577, + "epoch": 0.1358744664221728, + "flos": 18150797698560.0, + "grad_norm": 2.4110153977600044, + "language_loss": 0.78667426, + "learning_rate": 3.883669688490654e-06, + "loss": 0.8093437, + "num_input_tokens_seen": 24081070, + "step": 1130, + "time_per_iteration": 2.5068349838256836 + }, + { + "auxiliary_loss_clip": 0.01195893, + "auxiliary_loss_mlp": 0.00766058, + "balance_loss_clip": 1.05866027, + "balance_loss_mlp": 1.00041699, + "epoch": 0.13599470931281188, + "flos": 18442966924800.0, + "grad_norm": 1.9424638081951011, + "language_loss": 0.85507464, + "learning_rate": 3.883407751890256e-06, + "loss": 0.87469411, + "num_input_tokens_seen": 24099675, + "step": 1131, + "time_per_iteration": 2.5076582431793213 + }, + { + "auxiliary_loss_clip": 0.01191876, + "auxiliary_loss_mlp": 0.01048915, + "balance_loss_clip": 1.05910599, + "balance_loss_mlp": 1.03619492, + "epoch": 0.13611495220345096, + "flos": 26680766014080.0, + "grad_norm": 1.7181343736842447, + "language_loss": 0.85642576, + "learning_rate": 3.8831455295783994e-06, + "loss": 0.87883371, + "num_input_tokens_seen": 24118925, + "step": 1132, + "time_per_iteration": 2.627047061920166 + }, + { + "auxiliary_loss_clip": 0.01203861, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.06280267, + "balance_loss_mlp": 1.02939045, + "epoch": 0.13623519509409007, + "flos": 21686238673920.0, + "grad_norm": 1.643416965574082, + "language_loss": 0.73959291, + "learning_rate": 3.882883021594864e-06, + "loss": 0.76204079, + "num_input_tokens_seen": 24137065, + "step": 1133, + "time_per_iteration": 2.579265594482422 + }, + { + "auxiliary_loss_clip": 0.01181733, + "auxiliary_loss_mlp": 0.01036871, + "balance_loss_clip": 1.06068063, + "balance_loss_mlp": 1.02583218, + "epoch": 0.13635543798472916, + "flos": 14830389492480.0, + "grad_norm": 2.4077745151837826, + "language_loss": 0.86901402, + "learning_rate": 3.8826202279794705e-06, + "loss": 0.89120007, + "num_input_tokens_seen": 24154125, + "step": 1134, + "time_per_iteration": 2.5350449085235596 + }, + { + "auxiliary_loss_clip": 0.0124275, + "auxiliary_loss_mlp": 0.01039736, + "balance_loss_clip": 1.06911612, + "balance_loss_mlp": 1.0288341, + "epoch": 0.13647568087536824, + "flos": 22890323410560.0, + "grad_norm": 1.9634094323757916, + "language_loss": 0.70318204, + "learning_rate": 3.882357148772085e-06, + "loss": 0.72600693, + "num_input_tokens_seen": 24171550, + "step": 1135, + "time_per_iteration": 2.4762027263641357 + }, + { + "auxiliary_loss_clip": 0.01175443, + "auxiliary_loss_mlp": 0.01045917, + "balance_loss_clip": 1.05802798, + "balance_loss_mlp": 1.03416872, + "epoch": 0.13659592376600732, + "flos": 19937927998080.0, + "grad_norm": 2.7922594386765605, + "language_loss": 0.84492689, + "learning_rate": 3.882093784012617e-06, + "loss": 0.86714047, + "num_input_tokens_seen": 24190190, + "step": 1136, + "time_per_iteration": 2.534083127975464 + }, + { + "auxiliary_loss_clip": 0.01205692, + "auxiliary_loss_mlp": 0.0103658, + "balance_loss_clip": 1.06324697, + "balance_loss_mlp": 1.02511835, + "epoch": 0.13671616665664643, + "flos": 21428579439360.0, + "grad_norm": 1.9171071513542501, + "language_loss": 0.84301311, + "learning_rate": 3.881830133741019e-06, + "loss": 0.86543584, + "num_input_tokens_seen": 24209055, + "step": 1137, + "time_per_iteration": 2.5401196479797363 + }, + { + "auxiliary_loss_clip": 0.01190995, + "auxiliary_loss_mlp": 0.010478, + "balance_loss_clip": 1.06460023, + "balance_loss_mlp": 1.03609347, + "epoch": 0.13683640954728551, + "flos": 22778138257920.0, + "grad_norm": 1.9410813736814505, + "language_loss": 0.76070166, + "learning_rate": 3.881566197997285e-06, + "loss": 0.78308958, + "num_input_tokens_seen": 24225490, + "step": 1138, + "time_per_iteration": 2.585272789001465 + }, + { + "auxiliary_loss_clip": 0.01204776, + "auxiliary_loss_mlp": 0.01039517, + "balance_loss_clip": 1.06676984, + "balance_loss_mlp": 1.02851439, + "epoch": 0.1369566524379246, + "flos": 21725884310400.0, + "grad_norm": 1.5339118104084037, + "language_loss": 0.75060034, + "learning_rate": 3.881301976821456e-06, + "loss": 0.77304322, + "num_input_tokens_seen": 24245520, + "step": 1139, + "time_per_iteration": 2.5421037673950195 + }, + { + "auxiliary_loss_clip": 0.0122006, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.06502652, + "balance_loss_mlp": 1.03129435, + "epoch": 0.1370768953285637, + "flos": 18624459369600.0, + "grad_norm": 1.8970164879730396, + "language_loss": 0.90650415, + "learning_rate": 3.881037470253612e-06, + "loss": 0.92912388, + "num_input_tokens_seen": 24265035, + "step": 1140, + "time_per_iteration": 3.3628947734832764 + }, + { + "auxiliary_loss_clip": 0.01175835, + "auxiliary_loss_mlp": 0.01040442, + "balance_loss_clip": 1.06009603, + "balance_loss_mlp": 1.02960598, + "epoch": 0.1371971382192028, + "flos": 14939521989120.0, + "grad_norm": 2.6303102786651906, + "language_loss": 0.79046839, + "learning_rate": 3.88077267833388e-06, + "loss": 0.81263113, + "num_input_tokens_seen": 24281550, + "step": 1141, + "time_per_iteration": 2.544905424118042 + }, + { + "auxiliary_loss_clip": 0.0117028, + "auxiliary_loss_mlp": 0.01044457, + "balance_loss_clip": 1.05718565, + "balance_loss_mlp": 1.0331862, + "epoch": 0.13731738110984187, + "flos": 19023785844480.0, + "grad_norm": 2.096575567553329, + "language_loss": 0.83701593, + "learning_rate": 3.880507601102427e-06, + "loss": 0.85916328, + "num_input_tokens_seen": 24299485, + "step": 1142, + "time_per_iteration": 2.6105563640594482 + }, + { + "auxiliary_loss_clip": 0.01239174, + "auxiliary_loss_mlp": 0.01047938, + "balance_loss_clip": 1.06891823, + "balance_loss_mlp": 1.03702462, + "epoch": 0.13743762400048098, + "flos": 18187462506240.0, + "grad_norm": 2.8469880353884602, + "language_loss": 0.82199681, + "learning_rate": 3.880242238599467e-06, + "loss": 0.84486794, + "num_input_tokens_seen": 24316010, + "step": 1143, + "time_per_iteration": 2.445014715194702 + }, + { + "auxiliary_loss_clip": 0.01234912, + "auxiliary_loss_mlp": 0.01047229, + "balance_loss_clip": 1.06560016, + "balance_loss_mlp": 1.03576708, + "epoch": 0.13755786689112007, + "flos": 21031982398080.0, + "grad_norm": 1.8033794513004204, + "language_loss": 0.83189905, + "learning_rate": 3.879976590865254e-06, + "loss": 0.85472047, + "num_input_tokens_seen": 24335465, + "step": 1144, + "time_per_iteration": 2.5212836265563965 + }, + { + "auxiliary_loss_clip": 0.01207388, + "auxiliary_loss_mlp": 0.01045335, + "balance_loss_clip": 1.06589866, + "balance_loss_mlp": 1.03406358, + "epoch": 0.13767810978175915, + "flos": 21360636864000.0, + "grad_norm": 1.8640830388365308, + "language_loss": 0.87317169, + "learning_rate": 3.879710657940087e-06, + "loss": 0.8956989, + "num_input_tokens_seen": 24354415, + "step": 1145, + "time_per_iteration": 2.547109842300415 + }, + { + "auxiliary_loss_clip": 0.01226205, + "auxiliary_loss_mlp": 0.01054492, + "balance_loss_clip": 1.06520343, + "balance_loss_mlp": 1.04227889, + "epoch": 0.13779835267239823, + "flos": 30592084861440.0, + "grad_norm": 2.015525290913156, + "language_loss": 0.70455736, + "learning_rate": 3.879444439864308e-06, + "loss": 0.7273643, + "num_input_tokens_seen": 24373990, + "step": 1146, + "time_per_iteration": 3.3156747817993164 + }, + { + "auxiliary_loss_clip": 0.01221469, + "auxiliary_loss_mlp": 0.00766313, + "balance_loss_clip": 1.06280899, + "balance_loss_mlp": 1.00066495, + "epoch": 0.13791859556303734, + "flos": 22669867687680.0, + "grad_norm": 1.9964562402872512, + "language_loss": 0.86180943, + "learning_rate": 3.879177936678301e-06, + "loss": 0.88168716, + "num_input_tokens_seen": 24392995, + "step": 1147, + "time_per_iteration": 4.178260564804077 + }, + { + "auxiliary_loss_clip": 0.01211583, + "auxiliary_loss_mlp": 0.01043046, + "balance_loss_clip": 1.06459701, + "balance_loss_mlp": 1.03132725, + "epoch": 0.13803883845367643, + "flos": 35224166016000.0, + "grad_norm": 2.2988233421089848, + "language_loss": 0.76987356, + "learning_rate": 3.878911148422496e-06, + "loss": 0.79241979, + "num_input_tokens_seen": 24414470, + "step": 1148, + "time_per_iteration": 2.643392324447632 + }, + { + "auxiliary_loss_clip": 0.01222322, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.06442523, + "balance_loss_mlp": 1.02956891, + "epoch": 0.1381590813443155, + "flos": 32014542332160.0, + "grad_norm": 2.189990400431333, + "language_loss": 0.70677912, + "learning_rate": 3.878644075137364e-06, + "loss": 0.72941506, + "num_input_tokens_seen": 24435120, + "step": 1149, + "time_per_iteration": 2.5886855125427246 + }, + { + "auxiliary_loss_clip": 0.0116837, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.0549655, + "balance_loss_mlp": 1.02666509, + "epoch": 0.13827932423495462, + "flos": 17821855923840.0, + "grad_norm": 2.1550576330570514, + "language_loss": 0.79334706, + "learning_rate": 3.878376716863418e-06, + "loss": 0.81540769, + "num_input_tokens_seen": 24451420, + "step": 1150, + "time_per_iteration": 2.51723051071167 + }, + { + "auxiliary_loss_clip": 0.01202777, + "auxiliary_loss_mlp": 0.01045085, + "balance_loss_clip": 1.05982995, + "balance_loss_mlp": 1.03306866, + "epoch": 0.1383995671255937, + "flos": 19427098728960.0, + "grad_norm": 5.871303816597254, + "language_loss": 0.71790552, + "learning_rate": 3.878109073641219e-06, + "loss": 0.74038422, + "num_input_tokens_seen": 24470450, + "step": 1151, + "time_per_iteration": 2.5488815307617188 + }, + { + "auxiliary_loss_clip": 0.01172043, + "auxiliary_loss_mlp": 0.010417, + "balance_loss_clip": 1.05994558, + "balance_loss_mlp": 1.03064334, + "epoch": 0.13851981001623279, + "flos": 28296603331200.0, + "grad_norm": 1.5508358547550867, + "language_loss": 0.81265759, + "learning_rate": 3.877841145511366e-06, + "loss": 0.834795, + "num_input_tokens_seen": 24493190, + "step": 1152, + "time_per_iteration": 2.6709396839141846 + }, + { + "auxiliary_loss_clip": 0.01226703, + "auxiliary_loss_mlp": 0.01041941, + "balance_loss_clip": 1.06567085, + "balance_loss_mlp": 1.03045535, + "epoch": 0.13864005290687187, + "flos": 21213079793280.0, + "grad_norm": 1.8762178696647758, + "language_loss": 0.82642603, + "learning_rate": 3.8775729325145035e-06, + "loss": 0.84911251, + "num_input_tokens_seen": 24512425, + "step": 1153, + "time_per_iteration": 2.5234885215759277 + }, + { + "auxiliary_loss_clip": 0.01073429, + "auxiliary_loss_mlp": 0.01012187, + "balance_loss_clip": 1.01950455, + "balance_loss_mlp": 1.00952899, + "epoch": 0.13876029579751098, + "flos": 71653389413760.0, + "grad_norm": 0.7938055868828094, + "language_loss": 0.6472044, + "learning_rate": 3.877304434691321e-06, + "loss": 0.66806054, + "num_input_tokens_seen": 24579275, + "step": 1154, + "time_per_iteration": 3.2339136600494385 + }, + { + "auxiliary_loss_clip": 0.01188929, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.06273544, + "balance_loss_mlp": 1.02256095, + "epoch": 0.13888053868815006, + "flos": 21941348042880.0, + "grad_norm": 1.6842712277662588, + "language_loss": 0.79914737, + "learning_rate": 3.877035652082548e-06, + "loss": 0.82136476, + "num_input_tokens_seen": 24598720, + "step": 1155, + "time_per_iteration": 2.562546491622925 + }, + { + "auxiliary_loss_clip": 0.01196628, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.06213915, + "balance_loss_mlp": 1.02471614, + "epoch": 0.13900078157878915, + "flos": 19608627087360.0, + "grad_norm": 1.9114433231332668, + "language_loss": 0.85481822, + "learning_rate": 3.87676658472896e-06, + "loss": 0.87714732, + "num_input_tokens_seen": 24617530, + "step": 1156, + "time_per_iteration": 2.5305514335632324 + }, + { + "auxiliary_loss_clip": 0.01220095, + "auxiliary_loss_mlp": 0.0104503, + "balance_loss_clip": 1.0616771, + "balance_loss_mlp": 1.03439069, + "epoch": 0.13912102446942826, + "flos": 22638051216000.0, + "grad_norm": 2.0059744219112257, + "language_loss": 0.85106218, + "learning_rate": 3.876497232671372e-06, + "loss": 0.87371337, + "num_input_tokens_seen": 24637485, + "step": 1157, + "time_per_iteration": 2.489564895629883 + }, + { + "auxiliary_loss_clip": 0.0117955, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.05941677, + "balance_loss_mlp": 1.02901089, + "epoch": 0.13924126736006734, + "flos": 29643324975360.0, + "grad_norm": 1.9446276015636033, + "language_loss": 0.83603609, + "learning_rate": 3.876227595950647e-06, + "loss": 0.8582269, + "num_input_tokens_seen": 24656915, + "step": 1158, + "time_per_iteration": 2.6752874851226807 + }, + { + "auxiliary_loss_clip": 0.01238211, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.06763482, + "balance_loss_mlp": 1.02937031, + "epoch": 0.13936151025070642, + "flos": 27417653527680.0, + "grad_norm": 1.6112974470249788, + "language_loss": 0.79248452, + "learning_rate": 3.875957674607686e-06, + "loss": 0.81528056, + "num_input_tokens_seen": 24679190, + "step": 1159, + "time_per_iteration": 2.5316503047943115 + }, + { + "auxiliary_loss_clip": 0.01212441, + "auxiliary_loss_mlp": 0.00766559, + "balance_loss_clip": 1.05945969, + "balance_loss_mlp": 1.00070596, + "epoch": 0.1394817531413455, + "flos": 16399326625920.0, + "grad_norm": 2.745152521557462, + "language_loss": 0.88152218, + "learning_rate": 3.8756874686834386e-06, + "loss": 0.90131223, + "num_input_tokens_seen": 24697405, + "step": 1160, + "time_per_iteration": 2.4753894805908203 + }, + { + "auxiliary_loss_clip": 0.01224718, + "auxiliary_loss_mlp": 0.00766718, + "balance_loss_clip": 1.06229985, + "balance_loss_mlp": 1.00059319, + "epoch": 0.13960199603198462, + "flos": 30922319525760.0, + "grad_norm": 1.5296851091031625, + "language_loss": 0.80769718, + "learning_rate": 3.875416978218893e-06, + "loss": 0.82761157, + "num_input_tokens_seen": 24720600, + "step": 1161, + "time_per_iteration": 2.580864191055298 + }, + { + "auxiliary_loss_clip": 0.01198174, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.05661917, + "balance_loss_mlp": 1.0294807, + "epoch": 0.1397222389226237, + "flos": 18113773754880.0, + "grad_norm": 13.512801540782924, + "language_loss": 0.82489759, + "learning_rate": 3.8751462032550835e-06, + "loss": 0.84728849, + "num_input_tokens_seen": 24737605, + "step": 1162, + "time_per_iteration": 2.534541606903076 + }, + { + "auxiliary_loss_clip": 0.01202995, + "auxiliary_loss_mlp": 0.01027993, + "balance_loss_clip": 1.06549644, + "balance_loss_mlp": 1.01723433, + "epoch": 0.13984248181326278, + "flos": 16872772815360.0, + "grad_norm": 2.5041874374000392, + "language_loss": 0.82664227, + "learning_rate": 3.874875143833085e-06, + "loss": 0.84895217, + "num_input_tokens_seen": 24755845, + "step": 1163, + "time_per_iteration": 2.5094411373138428 + }, + { + "auxiliary_loss_clip": 0.0122095, + "auxiliary_loss_mlp": 0.01047486, + "balance_loss_clip": 1.06333816, + "balance_loss_mlp": 1.03568423, + "epoch": 0.1399627247039019, + "flos": 54121401267840.0, + "grad_norm": 1.809441658863343, + "language_loss": 0.68863541, + "learning_rate": 3.874603799994019e-06, + "loss": 0.7113198, + "num_input_tokens_seen": 24779380, + "step": 1164, + "time_per_iteration": 2.797175168991089 + }, + { + "auxiliary_loss_clip": 0.01182786, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.05867195, + "balance_loss_mlp": 1.02638566, + "epoch": 0.14008296759454097, + "flos": 11765521618560.0, + "grad_norm": 2.0568116690227636, + "language_loss": 0.87022305, + "learning_rate": 3.874332171779046e-06, + "loss": 0.89241767, + "num_input_tokens_seen": 24794260, + "step": 1165, + "time_per_iteration": 2.568225145339966 + }, + { + "auxiliary_loss_clip": 0.0118308, + "auxiliary_loss_mlp": 0.0103346, + "balance_loss_clip": 1.05651498, + "balance_loss_mlp": 1.02255273, + "epoch": 0.14020321048518006, + "flos": 22017514832640.0, + "grad_norm": 1.6720728256415833, + "language_loss": 0.75627828, + "learning_rate": 3.874060259229373e-06, + "loss": 0.77844369, + "num_input_tokens_seen": 24815835, + "step": 1166, + "time_per_iteration": 2.6282758712768555 + }, + { + "auxiliary_loss_clip": 0.01224605, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.06599569, + "balance_loss_mlp": 1.03320432, + "epoch": 0.14032345337581917, + "flos": 23404313076480.0, + "grad_norm": 2.1005222844913387, + "language_loss": 0.93716538, + "learning_rate": 3.873788062386249e-06, + "loss": 0.9598608, + "num_input_tokens_seen": 24834095, + "step": 1167, + "time_per_iteration": 3.344567060470581 + }, + { + "auxiliary_loss_clip": 0.01193847, + "auxiliary_loss_mlp": 0.01042578, + "balance_loss_clip": 1.06391454, + "balance_loss_mlp": 1.03195691, + "epoch": 0.14044369626645825, + "flos": 29645767100160.0, + "grad_norm": 1.9557510903710569, + "language_loss": 0.82077014, + "learning_rate": 3.873515581290965e-06, + "loss": 0.8431344, + "num_input_tokens_seen": 24858900, + "step": 1168, + "time_per_iteration": 2.672848701477051 + }, + { + "auxiliary_loss_clip": 0.01191536, + "auxiliary_loss_mlp": 0.01039288, + "balance_loss_clip": 1.0639782, + "balance_loss_mlp": 1.02794576, + "epoch": 0.14056393915709733, + "flos": 18332972501760.0, + "grad_norm": 10.310937155097955, + "language_loss": 0.75927639, + "learning_rate": 3.8732428159848575e-06, + "loss": 0.78158462, + "num_input_tokens_seen": 24877875, + "step": 1169, + "time_per_iteration": 2.574352502822876 + }, + { + "auxiliary_loss_clip": 0.01222151, + "auxiliary_loss_mlp": 0.0103911, + "balance_loss_clip": 1.0677588, + "balance_loss_mlp": 1.02782106, + "epoch": 0.14068418204773642, + "flos": 26687517770880.0, + "grad_norm": 1.914566701977521, + "language_loss": 0.78181374, + "learning_rate": 3.872969766509304e-06, + "loss": 0.80442631, + "num_input_tokens_seen": 24898430, + "step": 1170, + "time_per_iteration": 2.5370090007781982 + }, + { + "auxiliary_loss_clip": 0.01078067, + "auxiliary_loss_mlp": 0.01003821, + "balance_loss_clip": 1.02103245, + "balance_loss_mlp": 1.00079286, + "epoch": 0.14080442493837553, + "flos": 65259314501760.0, + "grad_norm": 0.7658627073932307, + "language_loss": 0.55671537, + "learning_rate": 3.872696432905726e-06, + "loss": 0.5775342, + "num_input_tokens_seen": 24959250, + "step": 1171, + "time_per_iteration": 3.1267569065093994 + }, + { + "auxiliary_loss_clip": 0.01222473, + "auxiliary_loss_mlp": 0.01043889, + "balance_loss_clip": 1.06142688, + "balance_loss_mlp": 1.03257596, + "epoch": 0.1409246678290146, + "flos": 25776715582080.0, + "grad_norm": 5.546998461052881, + "language_loss": 0.71459758, + "learning_rate": 3.872422815215589e-06, + "loss": 0.73726118, + "num_input_tokens_seen": 24978330, + "step": 1172, + "time_per_iteration": 2.5472028255462646 + }, + { + "auxiliary_loss_clip": 0.01215745, + "auxiliary_loss_mlp": 0.01044703, + "balance_loss_clip": 1.05991089, + "balance_loss_mlp": 1.03180456, + "epoch": 0.1410449107196537, + "flos": 21868521217920.0, + "grad_norm": 1.7228580108787452, + "language_loss": 0.74132156, + "learning_rate": 3.8721489134803994e-06, + "loss": 0.76392603, + "num_input_tokens_seen": 24997120, + "step": 1173, + "time_per_iteration": 3.3893330097198486 + }, + { + "auxiliary_loss_clip": 0.01218674, + "auxiliary_loss_mlp": 0.0104568, + "balance_loss_clip": 1.06441677, + "balance_loss_mlp": 1.0335927, + "epoch": 0.1411651536102928, + "flos": 16684133564160.0, + "grad_norm": 2.4506299516665164, + "language_loss": 0.72398233, + "learning_rate": 3.871874727741707e-06, + "loss": 0.74662584, + "num_input_tokens_seen": 25014350, + "step": 1174, + "time_per_iteration": 4.179301023483276 + }, + { + "auxiliary_loss_clip": 0.0121732, + "auxiliary_loss_mlp": 0.01039841, + "balance_loss_clip": 1.06673551, + "balance_loss_mlp": 1.02944636, + "epoch": 0.1412853965009319, + "flos": 20992264934400.0, + "grad_norm": 2.0172332081482733, + "language_loss": 0.96609676, + "learning_rate": 3.871600258041108e-06, + "loss": 0.98866832, + "num_input_tokens_seen": 25033875, + "step": 1175, + "time_per_iteration": 2.601478099822998 + }, + { + "auxiliary_loss_clip": 0.01201002, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.0600369, + "balance_loss_mlp": 1.02734923, + "epoch": 0.14140563939157097, + "flos": 20335279224960.0, + "grad_norm": 3.268061804043667, + "language_loss": 0.85863793, + "learning_rate": 3.871325504420238e-06, + "loss": 0.88104135, + "num_input_tokens_seen": 25052865, + "step": 1176, + "time_per_iteration": 2.562810182571411 + }, + { + "auxiliary_loss_clip": 0.01236416, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.06784892, + "balance_loss_mlp": 1.02533817, + "epoch": 0.14152588228221005, + "flos": 21068826773760.0, + "grad_norm": 1.7145377081477498, + "language_loss": 0.81782746, + "learning_rate": 3.871050466920776e-06, + "loss": 0.8405509, + "num_input_tokens_seen": 25072770, + "step": 1177, + "time_per_iteration": 2.502150058746338 + }, + { + "auxiliary_loss_clip": 0.01179289, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.05759549, + "balance_loss_mlp": 1.02102399, + "epoch": 0.14164612517284916, + "flos": 18223157646720.0, + "grad_norm": 1.8022537104679055, + "language_loss": 0.79641569, + "learning_rate": 3.870775145584447e-06, + "loss": 0.81852233, + "num_input_tokens_seen": 25090550, + "step": 1178, + "time_per_iteration": 2.632185459136963 + }, + { + "auxiliary_loss_clip": 0.01212098, + "auxiliary_loss_mlp": 0.01044947, + "balance_loss_clip": 1.06302476, + "balance_loss_mlp": 1.0333184, + "epoch": 0.14176636806348825, + "flos": 22744454279040.0, + "grad_norm": 2.808521870278306, + "language_loss": 0.64842236, + "learning_rate": 3.8704995404530145e-06, + "loss": 0.67099285, + "num_input_tokens_seen": 25106175, + "step": 1179, + "time_per_iteration": 2.6408607959747314 + }, + { + "auxiliary_loss_clip": 0.01233894, + "auxiliary_loss_mlp": 0.01038018, + "balance_loss_clip": 1.06740737, + "balance_loss_mlp": 1.02773619, + "epoch": 0.14188661095412733, + "flos": 22091095843200.0, + "grad_norm": 2.009320451255184, + "language_loss": 0.84931207, + "learning_rate": 3.87022365156829e-06, + "loss": 0.87203121, + "num_input_tokens_seen": 25126890, + "step": 1180, + "time_per_iteration": 2.526172161102295 + }, + { + "auxiliary_loss_clip": 0.01142297, + "auxiliary_loss_mlp": 0.01039185, + "balance_loss_clip": 1.05483246, + "balance_loss_mlp": 1.0281527, + "epoch": 0.14200685384476644, + "flos": 24352390604160.0, + "grad_norm": 2.282580410868269, + "language_loss": 0.81009901, + "learning_rate": 3.869947478972123e-06, + "loss": 0.83191383, + "num_input_tokens_seen": 25147915, + "step": 1181, + "time_per_iteration": 2.83522629737854 + }, + { + "auxiliary_loss_clip": 0.01214349, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.0625701, + "balance_loss_mlp": 1.02524948, + "epoch": 0.14212709673540552, + "flos": 24022048199040.0, + "grad_norm": 2.262742897864024, + "language_loss": 0.82555723, + "learning_rate": 3.869671022706412e-06, + "loss": 0.84807116, + "num_input_tokens_seen": 25166645, + "step": 1182, + "time_per_iteration": 2.696279525756836 + }, + { + "auxiliary_loss_clip": 0.01158631, + "auxiliary_loss_mlp": 0.0104602, + "balance_loss_clip": 1.05308425, + "balance_loss_mlp": 1.03517818, + "epoch": 0.1422473396260446, + "flos": 26431797870720.0, + "grad_norm": 1.9557381545607537, + "language_loss": 0.64627188, + "learning_rate": 3.869394282813092e-06, + "loss": 0.66831839, + "num_input_tokens_seen": 25185845, + "step": 1183, + "time_per_iteration": 2.659381628036499 + }, + { + "auxiliary_loss_clip": 0.01193628, + "auxiliary_loss_mlp": 0.01043299, + "balance_loss_clip": 1.05697608, + "balance_loss_mlp": 1.03195632, + "epoch": 0.1423675825166837, + "flos": 17055306754560.0, + "grad_norm": 3.1121010254052033, + "language_loss": 0.89463937, + "learning_rate": 3.869117259334147e-06, + "loss": 0.91700864, + "num_input_tokens_seen": 25203770, + "step": 1184, + "time_per_iteration": 2.5750486850738525 + }, + { + "auxiliary_loss_clip": 0.01214635, + "auxiliary_loss_mlp": 0.01042238, + "balance_loss_clip": 1.0623312, + "balance_loss_mlp": 1.03145552, + "epoch": 0.1424878254073228, + "flos": 17929480049280.0, + "grad_norm": 1.853639622062382, + "language_loss": 0.81930923, + "learning_rate": 3.868839952311599e-06, + "loss": 0.84187794, + "num_input_tokens_seen": 25221725, + "step": 1185, + "time_per_iteration": 2.49806547164917 + }, + { + "auxiliary_loss_clip": 0.01200315, + "auxiliary_loss_mlp": 0.0103552, + "balance_loss_clip": 1.06298769, + "balance_loss_mlp": 1.02446318, + "epoch": 0.14260806829796188, + "flos": 20303606407680.0, + "grad_norm": 2.323956828573697, + "language_loss": 0.80340397, + "learning_rate": 3.868562361787516e-06, + "loss": 0.82576227, + "num_input_tokens_seen": 25240855, + "step": 1186, + "time_per_iteration": 2.5370519161224365 + }, + { + "auxiliary_loss_clip": 0.01136701, + "auxiliary_loss_mlp": 0.01033927, + "balance_loss_clip": 1.05215549, + "balance_loss_mlp": 1.02290595, + "epoch": 0.14272831118860096, + "flos": 23185724860800.0, + "grad_norm": 2.056503246063335, + "language_loss": 0.691715, + "learning_rate": 3.868284487804009e-06, + "loss": 0.71342123, + "num_input_tokens_seen": 25260085, + "step": 1187, + "time_per_iteration": 2.696873903274536 + }, + { + "auxiliary_loss_clip": 0.01208476, + "auxiliary_loss_mlp": 0.01046048, + "balance_loss_clip": 1.06049669, + "balance_loss_mlp": 1.03538513, + "epoch": 0.14284855407924008, + "flos": 27232210586880.0, + "grad_norm": 1.6143287209150559, + "language_loss": 0.78217274, + "learning_rate": 3.86800633040323e-06, + "loss": 0.80471802, + "num_input_tokens_seen": 25280675, + "step": 1188, + "time_per_iteration": 2.6943676471710205 + }, + { + "auxiliary_loss_clip": 0.01202893, + "auxiliary_loss_mlp": 0.00765847, + "balance_loss_clip": 1.06457949, + "balance_loss_mlp": 1.00075531, + "epoch": 0.14296879696987916, + "flos": 28184202696960.0, + "grad_norm": 2.001762600775245, + "language_loss": 0.78061759, + "learning_rate": 3.867727889627376e-06, + "loss": 0.80030501, + "num_input_tokens_seen": 25300290, + "step": 1189, + "time_per_iteration": 2.5867836475372314 + }, + { + "auxiliary_loss_clip": 0.01178168, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_clip": 1.05849648, + "balance_loss_mlp": 1.03204358, + "epoch": 0.14308903986051824, + "flos": 19390290266880.0, + "grad_norm": 2.1207731494882895, + "language_loss": 0.78223324, + "learning_rate": 3.867449165518687e-06, + "loss": 0.80445737, + "num_input_tokens_seen": 25316760, + "step": 1190, + "time_per_iteration": 2.5264713764190674 + }, + { + "auxiliary_loss_clip": 0.01238012, + "auxiliary_loss_mlp": 0.00766565, + "balance_loss_clip": 1.06639266, + "balance_loss_mlp": 1.00079107, + "epoch": 0.14320928275115732, + "flos": 17457506317440.0, + "grad_norm": 1.7759648261721543, + "language_loss": 0.71031684, + "learning_rate": 3.867170158119444e-06, + "loss": 0.73036253, + "num_input_tokens_seen": 25335760, + "step": 1191, + "time_per_iteration": 2.556504487991333 + }, + { + "auxiliary_loss_clip": 0.01238046, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.06650412, + "balance_loss_mlp": 1.02736795, + "epoch": 0.14332952564179643, + "flos": 21466070259840.0, + "grad_norm": 2.214404937392677, + "language_loss": 0.75551212, + "learning_rate": 3.866890867471972e-06, + "loss": 0.77827168, + "num_input_tokens_seen": 25354230, + "step": 1192, + "time_per_iteration": 2.5354533195495605 + }, + { + "auxiliary_loss_clip": 0.01198297, + "auxiliary_loss_mlp": 0.01044425, + "balance_loss_clip": 1.0549221, + "balance_loss_mlp": 1.03257012, + "epoch": 0.14344976853243552, + "flos": 16396992241920.0, + "grad_norm": 2.5540064589372715, + "language_loss": 0.89672583, + "learning_rate": 3.86661129361864e-06, + "loss": 0.91915309, + "num_input_tokens_seen": 25368720, + "step": 1193, + "time_per_iteration": 3.3329994678497314 + }, + { + "auxiliary_loss_clip": 0.01202651, + "auxiliary_loss_mlp": 0.01046793, + "balance_loss_clip": 1.06281376, + "balance_loss_mlp": 1.03503346, + "epoch": 0.1435700114230746, + "flos": 18916736336640.0, + "grad_norm": 1.9344724869248264, + "language_loss": 0.86119479, + "learning_rate": 3.866331436601859e-06, + "loss": 0.88368922, + "num_input_tokens_seen": 25386715, + "step": 1194, + "time_per_iteration": 2.5208323001861572 + }, + { + "auxiliary_loss_clip": 0.01235557, + "auxiliary_loss_mlp": 0.01042975, + "balance_loss_clip": 1.06596208, + "balance_loss_mlp": 1.03156674, + "epoch": 0.1436902543137137, + "flos": 19755394058880.0, + "grad_norm": 2.205488070809292, + "language_loss": 0.73481584, + "learning_rate": 3.866051296464083e-06, + "loss": 0.75760114, + "num_input_tokens_seen": 25405550, + "step": 1195, + "time_per_iteration": 2.485048770904541 + }, + { + "auxiliary_loss_clip": 0.01234827, + "auxiliary_loss_mlp": 0.00765843, + "balance_loss_clip": 1.06353295, + "balance_loss_mlp": 1.00076652, + "epoch": 0.1438104972043528, + "flos": 14684807669760.0, + "grad_norm": 2.8025494673110067, + "language_loss": 0.85218567, + "learning_rate": 3.86577087324781e-06, + "loss": 0.87219238, + "num_input_tokens_seen": 25422040, + "step": 1196, + "time_per_iteration": 2.424133539199829 + }, + { + "auxiliary_loss_clip": 0.01217736, + "auxiliary_loss_mlp": 0.01037417, + "balance_loss_clip": 1.06584454, + "balance_loss_mlp": 1.02664042, + "epoch": 0.14393074009499188, + "flos": 17092330698240.0, + "grad_norm": 1.9612024199306752, + "language_loss": 0.77624083, + "learning_rate": 3.865490166995578e-06, + "loss": 0.79879236, + "num_input_tokens_seen": 25440270, + "step": 1197, + "time_per_iteration": 2.529550313949585 + }, + { + "auxiliary_loss_clip": 0.01219585, + "auxiliary_loss_mlp": 0.01041657, + "balance_loss_clip": 1.06455755, + "balance_loss_mlp": 1.03010535, + "epoch": 0.144050982985631, + "flos": 30476200608000.0, + "grad_norm": 4.971996642695616, + "language_loss": 0.8433696, + "learning_rate": 3.86520917774997e-06, + "loss": 0.86598206, + "num_input_tokens_seen": 25459705, + "step": 1198, + "time_per_iteration": 2.6028761863708496 + }, + { + "auxiliary_loss_clip": 0.01213289, + "auxiliary_loss_mlp": 0.01044164, + "balance_loss_clip": 1.06324959, + "balance_loss_mlp": 1.03384066, + "epoch": 0.14417122587627007, + "flos": 17858484817920.0, + "grad_norm": 2.2202358873986574, + "language_loss": 0.74863768, + "learning_rate": 3.864927905553614e-06, + "loss": 0.77121222, + "num_input_tokens_seen": 25477615, + "step": 1199, + "time_per_iteration": 2.5557374954223633 + }, + { + "auxiliary_loss_clip": 0.01181641, + "auxiliary_loss_mlp": 0.01042246, + "balance_loss_clip": 1.05734086, + "balance_loss_mlp": 1.03181517, + "epoch": 0.14429146876690915, + "flos": 21613914639360.0, + "grad_norm": 1.6120325061678757, + "language_loss": 0.88999832, + "learning_rate": 3.8646463504491765e-06, + "loss": 0.91223723, + "num_input_tokens_seen": 25497750, + "step": 1200, + "time_per_iteration": 4.302061557769775 + }, + { + "auxiliary_loss_clip": 0.01222008, + "auxiliary_loss_mlp": 0.0104002, + "balance_loss_clip": 1.06677556, + "balance_loss_mlp": 1.02803946, + "epoch": 0.14441171165754824, + "flos": 23258120722560.0, + "grad_norm": 1.7643310954077014, + "language_loss": 0.8278898, + "learning_rate": 3.8643645124793705e-06, + "loss": 0.85051, + "num_input_tokens_seen": 25516650, + "step": 1201, + "time_per_iteration": 2.5275492668151855 + }, + { + "auxiliary_loss_clip": 0.01215386, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.06172729, + "balance_loss_mlp": 1.02488685, + "epoch": 0.14453195454818735, + "flos": 42854213963520.0, + "grad_norm": 1.5969281970308902, + "language_loss": 0.74687552, + "learning_rate": 3.8640823916869515e-06, + "loss": 0.7693851, + "num_input_tokens_seen": 25540960, + "step": 1202, + "time_per_iteration": 2.702115297317505 + }, + { + "auxiliary_loss_clip": 0.01233115, + "auxiliary_loss_mlp": 0.01036422, + "balance_loss_clip": 1.06433201, + "balance_loss_mlp": 1.02570486, + "epoch": 0.14465219743882643, + "flos": 27235873774080.0, + "grad_norm": 1.745997800890538, + "language_loss": 0.78473842, + "learning_rate": 3.863799988114714e-06, + "loss": 0.80743378, + "num_input_tokens_seen": 25562990, + "step": 1203, + "time_per_iteration": 2.5356040000915527 + }, + { + "auxiliary_loss_clip": 0.01239593, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.06682611, + "balance_loss_mlp": 1.02324939, + "epoch": 0.1447724403294655, + "flos": 16690705752960.0, + "grad_norm": 5.995993969021272, + "language_loss": 0.70618761, + "learning_rate": 3.863517301805502e-06, + "loss": 0.72893059, + "num_input_tokens_seen": 25581380, + "step": 1204, + "time_per_iteration": 2.4148178100585938 + }, + { + "auxiliary_loss_clip": 0.01190975, + "auxiliary_loss_mlp": 0.01040444, + "balance_loss_clip": 1.06332421, + "balance_loss_mlp": 1.02926266, + "epoch": 0.14489268322010462, + "flos": 20073741321600.0, + "grad_norm": 2.501096555932035, + "language_loss": 0.9669441, + "learning_rate": 3.863234332802196e-06, + "loss": 0.98925823, + "num_input_tokens_seen": 25593585, + "step": 1205, + "time_per_iteration": 2.5622525215148926 + }, + { + "auxiliary_loss_clip": 0.01197562, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.05821252, + "balance_loss_mlp": 1.0336864, + "epoch": 0.1450129261107437, + "flos": 27125627955840.0, + "grad_norm": 2.1030886228043117, + "language_loss": 0.74215102, + "learning_rate": 3.862951081147723e-06, + "loss": 0.76456577, + "num_input_tokens_seen": 25613750, + "step": 1206, + "time_per_iteration": 2.5627236366271973 + }, + { + "auxiliary_loss_clip": 0.01218229, + "auxiliary_loss_mlp": 0.01040788, + "balance_loss_clip": 1.06570017, + "balance_loss_mlp": 1.03087544, + "epoch": 0.1451331690013828, + "flos": 25702344472320.0, + "grad_norm": 2.3329062118639916, + "language_loss": 0.77881825, + "learning_rate": 3.862667546885053e-06, + "loss": 0.80140841, + "num_input_tokens_seen": 25632300, + "step": 1207, + "time_per_iteration": 2.5468392372131348 + }, + { + "auxiliary_loss_clip": 0.01207141, + "auxiliary_loss_mlp": 0.01042052, + "balance_loss_clip": 1.06144905, + "balance_loss_mlp": 1.03090012, + "epoch": 0.14525341189202187, + "flos": 25737393168000.0, + "grad_norm": 2.0588635540689166, + "language_loss": 0.73146099, + "learning_rate": 3.8623837300571965e-06, + "loss": 0.75395298, + "num_input_tokens_seen": 25651285, + "step": 1208, + "time_per_iteration": 2.5630273818969727 + }, + { + "auxiliary_loss_clip": 0.01236599, + "auxiliary_loss_mlp": 0.01038061, + "balance_loss_clip": 1.06619263, + "balance_loss_mlp": 1.02670598, + "epoch": 0.14537365478266098, + "flos": 23073898844160.0, + "grad_norm": 2.3296526383850877, + "language_loss": 0.84067225, + "learning_rate": 3.8620996307072085e-06, + "loss": 0.86341882, + "num_input_tokens_seen": 25671990, + "step": 1209, + "time_per_iteration": 2.474330425262451 + }, + { + "auxiliary_loss_clip": 0.01188648, + "auxiliary_loss_mlp": 0.01038214, + "balance_loss_clip": 1.05639768, + "balance_loss_mlp": 1.02694249, + "epoch": 0.14549389767330007, + "flos": 20595021448320.0, + "grad_norm": 1.8315405527030273, + "language_loss": 0.64536703, + "learning_rate": 3.861815248878188e-06, + "loss": 0.66763568, + "num_input_tokens_seen": 25689475, + "step": 1210, + "time_per_iteration": 2.593482255935669 + }, + { + "auxiliary_loss_clip": 0.0120048, + "auxiliary_loss_mlp": 0.01043552, + "balance_loss_clip": 1.06309366, + "balance_loss_mlp": 1.03321075, + "epoch": 0.14561414056393915, + "flos": 15121804533120.0, + "grad_norm": 5.521584986918201, + "language_loss": 0.79655075, + "learning_rate": 3.861530584613274e-06, + "loss": 0.81899112, + "num_input_tokens_seen": 25707475, + "step": 1211, + "time_per_iteration": 2.5235373973846436 + }, + { + "auxiliary_loss_clip": 0.01222332, + "auxiliary_loss_mlp": 0.00766027, + "balance_loss_clip": 1.06636417, + "balance_loss_mlp": 1.00107944, + "epoch": 0.14573438345457826, + "flos": 19427493778560.0, + "grad_norm": 2.199432454665693, + "language_loss": 0.82192487, + "learning_rate": 3.86124563795565e-06, + "loss": 0.84180844, + "num_input_tokens_seen": 25726290, + "step": 1212, + "time_per_iteration": 2.523937702178955 + }, + { + "auxiliary_loss_clip": 0.01233505, + "auxiliary_loss_mlp": 0.0103872, + "balance_loss_clip": 1.0668366, + "balance_loss_mlp": 1.02824187, + "epoch": 0.14585462634521734, + "flos": 24828422572800.0, + "grad_norm": 1.6804086105383762, + "language_loss": 0.70204425, + "learning_rate": 3.860960408948543e-06, + "loss": 0.72476649, + "num_input_tokens_seen": 25748040, + "step": 1213, + "time_per_iteration": 2.5231709480285645 + }, + { + "auxiliary_loss_clip": 0.01207665, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_clip": 1.06314898, + "balance_loss_mlp": 1.03201938, + "epoch": 0.14597486923585642, + "flos": 15448627405440.0, + "grad_norm": 2.4291383192565514, + "language_loss": 0.89876348, + "learning_rate": 3.860674897635222e-06, + "loss": 0.92126286, + "num_input_tokens_seen": 25764525, + "step": 1214, + "time_per_iteration": 2.5019190311431885 + }, + { + "auxiliary_loss_clip": 0.01218165, + "auxiliary_loss_mlp": 0.01048291, + "balance_loss_clip": 1.0659411, + "balance_loss_mlp": 1.03723979, + "epoch": 0.1460951121264955, + "flos": 16655154266880.0, + "grad_norm": 1.9940973093713716, + "language_loss": 0.83366507, + "learning_rate": 3.860389104058998e-06, + "loss": 0.85632968, + "num_input_tokens_seen": 25782755, + "step": 1215, + "time_per_iteration": 2.559870481491089 + }, + { + "auxiliary_loss_clip": 0.01201474, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.06231809, + "balance_loss_mlp": 1.02429473, + "epoch": 0.14621535501713462, + "flos": 24863291700480.0, + "grad_norm": 1.8641895390783614, + "language_loss": 0.72593713, + "learning_rate": 3.860103028263227e-06, + "loss": 0.74829912, + "num_input_tokens_seen": 25805860, + "step": 1216, + "time_per_iteration": 2.6286466121673584 + }, + { + "auxiliary_loss_clip": 0.0116329, + "auxiliary_loss_mlp": 0.01035046, + "balance_loss_clip": 1.05190015, + "balance_loss_mlp": 1.02422214, + "epoch": 0.1463355979077737, + "flos": 25228000442880.0, + "grad_norm": 2.333938901783611, + "language_loss": 0.70038521, + "learning_rate": 3.859816670291304e-06, + "loss": 0.7223686, + "num_input_tokens_seen": 25824955, + "step": 1217, + "time_per_iteration": 2.682464838027954 + }, + { + "auxiliary_loss_clip": 0.01150706, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.05657268, + "balance_loss_mlp": 1.02086031, + "epoch": 0.14645584079841278, + "flos": 22054143726720.0, + "grad_norm": 2.3336884630007955, + "language_loss": 0.8993752, + "learning_rate": 3.859530030186672e-06, + "loss": 0.92120707, + "num_input_tokens_seen": 25841965, + "step": 1218, + "time_per_iteration": 2.666663408279419 + }, + { + "auxiliary_loss_clip": 0.01207673, + "auxiliary_loss_mlp": 0.01037527, + "balance_loss_clip": 1.06523824, + "balance_loss_mlp": 1.02678597, + "epoch": 0.1465760836890519, + "flos": 23623870959360.0, + "grad_norm": 2.1699847963898073, + "language_loss": 0.8243829, + "learning_rate": 3.859243107992813e-06, + "loss": 0.84683496, + "num_input_tokens_seen": 25860770, + "step": 1219, + "time_per_iteration": 2.598076820373535 + }, + { + "auxiliary_loss_clip": 0.01188952, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.05639017, + "balance_loss_mlp": 1.02721632, + "epoch": 0.14669632657969098, + "flos": 37407893356800.0, + "grad_norm": 2.087799042459627, + "language_loss": 0.77893454, + "learning_rate": 3.858955903753252e-06, + "loss": 0.80121017, + "num_input_tokens_seen": 25879410, + "step": 1220, + "time_per_iteration": 2.7478127479553223 + }, + { + "auxiliary_loss_clip": 0.01216577, + "auxiliary_loss_mlp": 0.01037962, + "balance_loss_clip": 1.06097937, + "balance_loss_mlp": 1.02846062, + "epoch": 0.14681656947033006, + "flos": 28365910623360.0, + "grad_norm": 1.5556749032978585, + "language_loss": 0.83639264, + "learning_rate": 3.858668417511559e-06, + "loss": 0.85893798, + "num_input_tokens_seen": 25902160, + "step": 1221, + "time_per_iteration": 3.4299418926239014 + }, + { + "auxiliary_loss_clip": 0.01206603, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.0648303, + "balance_loss_mlp": 1.02134025, + "epoch": 0.14693681236096917, + "flos": 18479488078080.0, + "grad_norm": 2.1766633783015403, + "language_loss": 0.76281548, + "learning_rate": 3.8583806493113445e-06, + "loss": 0.78520465, + "num_input_tokens_seen": 25920505, + "step": 1222, + "time_per_iteration": 2.6012563705444336 + }, + { + "auxiliary_loss_clip": 0.01215196, + "auxiliary_loss_mlp": 0.01041007, + "balance_loss_clip": 1.06405354, + "balance_loss_mlp": 1.03082013, + "epoch": 0.14705705525160825, + "flos": 20777806782720.0, + "grad_norm": 2.2225291822672437, + "language_loss": 0.82385731, + "learning_rate": 3.858092599196263e-06, + "loss": 0.84641933, + "num_input_tokens_seen": 25938460, + "step": 1223, + "time_per_iteration": 2.4934475421905518 + }, + { + "auxiliary_loss_clip": 0.01218014, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.06439936, + "balance_loss_mlp": 1.02188921, + "epoch": 0.14717729814224734, + "flos": 29932944336000.0, + "grad_norm": 2.5930044814038324, + "language_loss": 0.82323581, + "learning_rate": 3.857804267210012e-06, + "loss": 0.84573829, + "num_input_tokens_seen": 25957760, + "step": 1224, + "time_per_iteration": 2.541569232940674 + }, + { + "auxiliary_loss_clip": 0.01170889, + "auxiliary_loss_mlp": 0.01043631, + "balance_loss_clip": 1.05348575, + "balance_loss_mlp": 1.03323007, + "epoch": 0.14729754103288642, + "flos": 20047491457920.0, + "grad_norm": 2.1280711188396375, + "language_loss": 0.88104141, + "learning_rate": 3.857515653396331e-06, + "loss": 0.90318668, + "num_input_tokens_seen": 25974970, + "step": 1225, + "time_per_iteration": 2.5456156730651855 + }, + { + "auxiliary_loss_clip": 0.01171425, + "auxiliary_loss_mlp": 0.01038743, + "balance_loss_clip": 1.05644774, + "balance_loss_mlp": 1.02866352, + "epoch": 0.14741778392352553, + "flos": 19281516906240.0, + "grad_norm": 2.3431463258731995, + "language_loss": 0.87043232, + "learning_rate": 3.857226757799002e-06, + "loss": 0.89253402, + "num_input_tokens_seen": 25992525, + "step": 1226, + "time_per_iteration": 3.3609275817871094 + }, + { + "auxiliary_loss_clip": 0.01199118, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.05969715, + "balance_loss_mlp": 1.02741694, + "epoch": 0.1475380268141646, + "flos": 25411108999680.0, + "grad_norm": 2.3536418791766964, + "language_loss": 0.74155647, + "learning_rate": 3.85693758046185e-06, + "loss": 0.76392758, + "num_input_tokens_seen": 26010815, + "step": 1227, + "time_per_iteration": 4.199660301208496 + }, + { + "auxiliary_loss_clip": 0.01232908, + "auxiliary_loss_mlp": 0.01041756, + "balance_loss_clip": 1.06786013, + "balance_loss_mlp": 1.03155172, + "epoch": 0.1476582697048037, + "flos": 20847652778880.0, + "grad_norm": 1.926404603545611, + "language_loss": 0.82754517, + "learning_rate": 3.8566481214287435e-06, + "loss": 0.85029173, + "num_input_tokens_seen": 26028935, + "step": 1228, + "time_per_iteration": 2.489389181137085 + }, + { + "auxiliary_loss_clip": 0.01177275, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_clip": 1.05549455, + "balance_loss_mlp": 1.0336647, + "epoch": 0.1477785125954428, + "flos": 14028109269120.0, + "grad_norm": 2.0631050130531796, + "language_loss": 0.90664279, + "learning_rate": 3.8563583807435935e-06, + "loss": 0.92885911, + "num_input_tokens_seen": 26045080, + "step": 1229, + "time_per_iteration": 2.5459182262420654 + }, + { + "auxiliary_loss_clip": 0.01220487, + "auxiliary_loss_mlp": 0.00765693, + "balance_loss_clip": 1.06373549, + "balance_loss_mlp": 1.00118995, + "epoch": 0.1478987554860819, + "flos": 20516699842560.0, + "grad_norm": 2.151583407052703, + "language_loss": 0.77519083, + "learning_rate": 3.856068358450353e-06, + "loss": 0.79505265, + "num_input_tokens_seen": 26065030, + "step": 1230, + "time_per_iteration": 2.523956298828125 + }, + { + "auxiliary_loss_clip": 0.01199544, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.06643867, + "balance_loss_mlp": 1.031358, + "epoch": 0.14801899837672097, + "flos": 17857012360320.0, + "grad_norm": 1.7727411085181164, + "language_loss": 0.855335, + "learning_rate": 3.8557780545930186e-06, + "loss": 0.87774724, + "num_input_tokens_seen": 26083445, + "step": 1231, + "time_per_iteration": 2.5099692344665527 + }, + { + "auxiliary_loss_clip": 0.01201041, + "auxiliary_loss_mlp": 0.0104045, + "balance_loss_clip": 1.06306303, + "balance_loss_mlp": 1.03039432, + "epoch": 0.14813924126736006, + "flos": 20881408584960.0, + "grad_norm": 1.7840170347637485, + "language_loss": 0.7953316, + "learning_rate": 3.855487469215628e-06, + "loss": 0.81774652, + "num_input_tokens_seen": 26102375, + "step": 1232, + "time_per_iteration": 2.579627752304077 + }, + { + "auxiliary_loss_clip": 0.01186958, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.06098998, + "balance_loss_mlp": 1.02443576, + "epoch": 0.14825948415799917, + "flos": 37414070496000.0, + "grad_norm": 2.1075343167426537, + "language_loss": 0.72536087, + "learning_rate": 3.855196602362264e-06, + "loss": 0.74757934, + "num_input_tokens_seen": 26125295, + "step": 1233, + "time_per_iteration": 2.8180525302886963 + }, + { + "auxiliary_loss_clip": 0.01214793, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.06178856, + "balance_loss_mlp": 1.02136278, + "epoch": 0.14837972704863825, + "flos": 22014641744640.0, + "grad_norm": 1.9838775211896933, + "language_loss": 0.94167888, + "learning_rate": 3.854905454077051e-06, + "loss": 0.9641459, + "num_input_tokens_seen": 26142905, + "step": 1234, + "time_per_iteration": 2.5686495304107666 + }, + { + "auxiliary_loss_clip": 0.01137481, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.05279207, + "balance_loss_mlp": 1.02734292, + "epoch": 0.14849996993927733, + "flos": 20996323171200.0, + "grad_norm": 2.2573491934859526, + "language_loss": 0.88214552, + "learning_rate": 3.854614024404155e-06, + "loss": 0.90389562, + "num_input_tokens_seen": 26161215, + "step": 1235, + "time_per_iteration": 2.7196803092956543 + }, + { + "auxiliary_loss_clip": 0.01187531, + "auxiliary_loss_mlp": 0.01033257, + "balance_loss_clip": 1.05716324, + "balance_loss_mlp": 1.02280831, + "epoch": 0.14862021282991644, + "flos": 20047994248320.0, + "grad_norm": 1.9089457635837919, + "language_loss": 0.89282167, + "learning_rate": 3.8543223133877865e-06, + "loss": 0.91502953, + "num_input_tokens_seen": 26179810, + "step": 1236, + "time_per_iteration": 2.5940613746643066 + }, + { + "auxiliary_loss_clip": 0.01183638, + "auxiliary_loss_mlp": 0.01042046, + "balance_loss_clip": 1.05688262, + "balance_loss_mlp": 1.02977979, + "epoch": 0.14874045572055553, + "flos": 22712027276160.0, + "grad_norm": 1.8445128142860934, + "language_loss": 0.88202894, + "learning_rate": 3.854030321072198e-06, + "loss": 0.90428585, + "num_input_tokens_seen": 26199715, + "step": 1237, + "time_per_iteration": 2.8678267002105713 + }, + { + "auxiliary_loss_clip": 0.01191781, + "auxiliary_loss_mlp": 0.01030139, + "balance_loss_clip": 1.06005001, + "balance_loss_mlp": 1.01998234, + "epoch": 0.1488606986111946, + "flos": 25411288567680.0, + "grad_norm": 1.9483647369410164, + "language_loss": 0.73584008, + "learning_rate": 3.853738047501682e-06, + "loss": 0.75805926, + "num_input_tokens_seen": 26220275, + "step": 1238, + "time_per_iteration": 2.7049202919006348 + }, + { + "auxiliary_loss_clip": 0.01218715, + "auxiliary_loss_mlp": 0.01039142, + "balance_loss_clip": 1.06586349, + "balance_loss_mlp": 1.02797818, + "epoch": 0.1489809415018337, + "flos": 17018749687680.0, + "grad_norm": 2.075972479156352, + "language_loss": 0.77499306, + "learning_rate": 3.85344549272058e-06, + "loss": 0.7975716, + "num_input_tokens_seen": 26238255, + "step": 1239, + "time_per_iteration": 2.5351147651672363 + }, + { + "auxiliary_loss_clip": 0.01212706, + "auxiliary_loss_mlp": 0.01039232, + "balance_loss_clip": 1.06142783, + "balance_loss_mlp": 1.02824092, + "epoch": 0.1491011843924728, + "flos": 33659394860160.0, + "grad_norm": 1.7969874009261801, + "language_loss": 0.82827985, + "learning_rate": 3.853152656773269e-06, + "loss": 0.8507992, + "num_input_tokens_seen": 26259690, + "step": 1240, + "time_per_iteration": 2.667109966278076 + }, + { + "auxiliary_loss_clip": 0.01198511, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.06150651, + "balance_loss_mlp": 1.02347493, + "epoch": 0.14922142728311188, + "flos": 21179000764800.0, + "grad_norm": 1.9606264955825021, + "language_loss": 0.84800541, + "learning_rate": 3.852859539704174e-06, + "loss": 0.87033212, + "num_input_tokens_seen": 26278990, + "step": 1241, + "time_per_iteration": 2.6013031005859375 + }, + { + "auxiliary_loss_clip": 0.01166085, + "auxiliary_loss_mlp": 0.01040563, + "balance_loss_clip": 1.05418491, + "balance_loss_mlp": 1.02991736, + "epoch": 0.14934167017375097, + "flos": 29860548474240.0, + "grad_norm": 1.894130687108162, + "language_loss": 0.76595157, + "learning_rate": 3.85256614155776e-06, + "loss": 0.78801805, + "num_input_tokens_seen": 26299120, + "step": 1242, + "time_per_iteration": 2.697136878967285 + }, + { + "auxiliary_loss_clip": 0.01212995, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.05970931, + "balance_loss_mlp": 1.024912, + "epoch": 0.14946191306439008, + "flos": 17019216564480.0, + "grad_norm": 1.9377910472175266, + "language_loss": 0.74569196, + "learning_rate": 3.852272462378535e-06, + "loss": 0.76817536, + "num_input_tokens_seen": 26316995, + "step": 1243, + "time_per_iteration": 2.5294992923736572 + }, + { + "auxiliary_loss_clip": 0.0119959, + "auxiliary_loss_mlp": 0.01043554, + "balance_loss_clip": 1.06105196, + "balance_loss_mlp": 1.03357601, + "epoch": 0.14958215595502916, + "flos": 15669047214720.0, + "grad_norm": 1.9167691044361883, + "language_loss": 0.77718198, + "learning_rate": 3.85197850221105e-06, + "loss": 0.79961348, + "num_input_tokens_seen": 26333295, + "step": 1244, + "time_per_iteration": 2.5356898307800293 + }, + { + "auxiliary_loss_clip": 0.01217267, + "auxiliary_loss_mlp": 0.01038893, + "balance_loss_clip": 1.06765032, + "balance_loss_mlp": 1.02871215, + "epoch": 0.14970239884566824, + "flos": 33108560818560.0, + "grad_norm": 1.846309031505951, + "language_loss": 0.76009542, + "learning_rate": 3.851684261099899e-06, + "loss": 0.78265703, + "num_input_tokens_seen": 26355035, + "step": 1245, + "time_per_iteration": 2.6334850788116455 + }, + { + "auxiliary_loss_clip": 0.01195769, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.05859756, + "balance_loss_mlp": 1.0236876, + "epoch": 0.14982264173630733, + "flos": 17821245392640.0, + "grad_norm": 1.9784966353138738, + "language_loss": 0.86618328, + "learning_rate": 3.851389739089718e-06, + "loss": 0.88849628, + "num_input_tokens_seen": 26371655, + "step": 1246, + "time_per_iteration": 2.4992992877960205 + }, + { + "auxiliary_loss_clip": 0.01222927, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.06952918, + "balance_loss_mlp": 1.02729273, + "epoch": 0.14994288462694644, + "flos": 32409559175040.0, + "grad_norm": 1.8646401595558197, + "language_loss": 0.80390334, + "learning_rate": 3.851094936225186e-06, + "loss": 0.82651091, + "num_input_tokens_seen": 26392540, + "step": 1247, + "time_per_iteration": 3.3245184421539307 + }, + { + "auxiliary_loss_clip": 0.01198179, + "auxiliary_loss_mlp": 0.01030544, + "balance_loss_clip": 1.06495309, + "balance_loss_mlp": 1.02032173, + "epoch": 0.15006312751758552, + "flos": 31794661226880.0, + "grad_norm": 1.4361056906965086, + "language_loss": 0.76639128, + "learning_rate": 3.850799852551024e-06, + "loss": 0.78867853, + "num_input_tokens_seen": 26414960, + "step": 1248, + "time_per_iteration": 2.604133129119873 + }, + { + "auxiliary_loss_clip": 0.01208518, + "auxiliary_loss_mlp": 0.01042822, + "balance_loss_clip": 1.06055403, + "balance_loss_mlp": 1.03155136, + "epoch": 0.1501833704082246, + "flos": 16618022582400.0, + "grad_norm": 2.3968315918848697, + "language_loss": 0.86450148, + "learning_rate": 3.850504488111995e-06, + "loss": 0.88701487, + "num_input_tokens_seen": 26431635, + "step": 1249, + "time_per_iteration": 2.4781830310821533 + }, + { + "auxiliary_loss_clip": 0.01191947, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.05761659, + "balance_loss_mlp": 1.02242303, + "epoch": 0.15030361329886371, + "flos": 23471178243840.0, + "grad_norm": 2.1618187401357, + "language_loss": 0.82751626, + "learning_rate": 3.850208842952907e-06, + "loss": 0.84975946, + "num_input_tokens_seen": 26450440, + "step": 1250, + "time_per_iteration": 2.5932819843292236 + }, + { + "auxiliary_loss_clip": 0.01177491, + "auxiliary_loss_mlp": 0.01041265, + "balance_loss_clip": 1.05656052, + "balance_loss_mlp": 1.03056002, + "epoch": 0.1504238561895028, + "flos": 25629409906560.0, + "grad_norm": 1.9155241691215383, + "language_loss": 0.79397964, + "learning_rate": 3.849912917118608e-06, + "loss": 0.81616724, + "num_input_tokens_seen": 26471480, + "step": 1251, + "time_per_iteration": 2.6426584720611572 + }, + { + "auxiliary_loss_clip": 0.01128717, + "auxiliary_loss_mlp": 0.01010275, + "balance_loss_clip": 1.03730643, + "balance_loss_mlp": 1.00724685, + "epoch": 0.15054409908014188, + "flos": 52095146129280.0, + "grad_norm": 0.8852069280610807, + "language_loss": 0.59294569, + "learning_rate": 3.849616710653992e-06, + "loss": 0.61433554, + "num_input_tokens_seen": 26532950, + "step": 1252, + "time_per_iteration": 3.051146984100342 + }, + { + "auxiliary_loss_clip": 0.01213775, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.06332695, + "balance_loss_mlp": 1.02819586, + "epoch": 0.150664341970781, + "flos": 18880251096960.0, + "grad_norm": 1.8306079649858018, + "language_loss": 0.75011694, + "learning_rate": 3.84932022360399e-06, + "loss": 0.77264643, + "num_input_tokens_seen": 26551615, + "step": 1253, + "time_per_iteration": 3.309061288833618 + }, + { + "auxiliary_loss_clip": 0.0120186, + "auxiliary_loss_mlp": 0.01047958, + "balance_loss_clip": 1.06609726, + "balance_loss_mlp": 1.03640103, + "epoch": 0.15078458486142007, + "flos": 22163240309760.0, + "grad_norm": 3.1467694429227144, + "language_loss": 0.84721869, + "learning_rate": 3.849023456013581e-06, + "loss": 0.86971688, + "num_input_tokens_seen": 26569175, + "step": 1254, + "time_per_iteration": 3.4654369354248047 + }, + { + "auxiliary_loss_clip": 0.01223025, + "auxiliary_loss_mlp": 0.01040879, + "balance_loss_clip": 1.06529975, + "balance_loss_mlp": 1.02985799, + "epoch": 0.15090482775205916, + "flos": 26651894457600.0, + "grad_norm": 2.206111092908473, + "language_loss": 0.62324739, + "learning_rate": 3.848726407927784e-06, + "loss": 0.64588642, + "num_input_tokens_seen": 26589560, + "step": 1255, + "time_per_iteration": 2.5752763748168945 + }, + { + "auxiliary_loss_clip": 0.01204494, + "auxiliary_loss_mlp": 0.01040541, + "balance_loss_clip": 1.06490159, + "balance_loss_mlp": 1.03045022, + "epoch": 0.15102507064269824, + "flos": 21798998444160.0, + "grad_norm": 2.455917277957209, + "language_loss": 0.86136806, + "learning_rate": 3.84842907939166e-06, + "loss": 0.88381845, + "num_input_tokens_seen": 26608785, + "step": 1256, + "time_per_iteration": 2.579418182373047 + }, + { + "auxiliary_loss_clip": 0.01179641, + "auxiliary_loss_mlp": 0.01044904, + "balance_loss_clip": 1.05955935, + "balance_loss_mlp": 1.03450906, + "epoch": 0.15114531353333735, + "flos": 22820908377600.0, + "grad_norm": 2.8060968145051794, + "language_loss": 0.71663916, + "learning_rate": 3.8481314704503146e-06, + "loss": 0.73888457, + "num_input_tokens_seen": 26628615, + "step": 1257, + "time_per_iteration": 2.5872514247894287 + }, + { + "auxiliary_loss_clip": 0.01219811, + "auxiliary_loss_mlp": 0.01040407, + "balance_loss_clip": 1.0691011, + "balance_loss_mlp": 1.03022075, + "epoch": 0.15126555642397643, + "flos": 19682674974720.0, + "grad_norm": 2.0882302997137914, + "language_loss": 0.87752086, + "learning_rate": 3.847833581148895e-06, + "loss": 0.90012306, + "num_input_tokens_seen": 26647525, + "step": 1258, + "time_per_iteration": 2.5161356925964355 + }, + { + "auxiliary_loss_clip": 0.01230845, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.06317234, + "balance_loss_mlp": 1.02459311, + "epoch": 0.15138579931461552, + "flos": 28726022424960.0, + "grad_norm": 5.407241101664347, + "language_loss": 0.81182969, + "learning_rate": 3.84753541153259e-06, + "loss": 0.83449572, + "num_input_tokens_seen": 26667095, + "step": 1259, + "time_per_iteration": 2.545649766921997 + }, + { + "auxiliary_loss_clip": 0.01218021, + "auxiliary_loss_mlp": 0.01037919, + "balance_loss_clip": 1.06639695, + "balance_loss_mlp": 1.02786338, + "epoch": 0.15150604220525463, + "flos": 22127006465280.0, + "grad_norm": 1.5734462770400977, + "language_loss": 0.83185923, + "learning_rate": 3.847236961646633e-06, + "loss": 0.85441858, + "num_input_tokens_seen": 26686075, + "step": 1260, + "time_per_iteration": 2.5548789501190186 + }, + { + "auxiliary_loss_clip": 0.01194536, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.05972373, + "balance_loss_mlp": 1.03080893, + "epoch": 0.1516262850958937, + "flos": 12968708515200.0, + "grad_norm": 2.144056087054954, + "language_loss": 0.77974689, + "learning_rate": 3.846938231536296e-06, + "loss": 0.80211365, + "num_input_tokens_seen": 26701695, + "step": 1261, + "time_per_iteration": 2.5173611640930176 + }, + { + "auxiliary_loss_clip": 0.01222641, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.06780601, + "balance_loss_mlp": 1.02248275, + "epoch": 0.1517465279865328, + "flos": 21797130936960.0, + "grad_norm": 1.9857101710018616, + "language_loss": 0.81057531, + "learning_rate": 3.8466392212468995e-06, + "loss": 0.8331275, + "num_input_tokens_seen": 26721885, + "step": 1262, + "time_per_iteration": 2.545497179031372 + }, + { + "auxiliary_loss_clip": 0.01102087, + "auxiliary_loss_mlp": 0.01007892, + "balance_loss_clip": 1.02883697, + "balance_loss_mlp": 1.00476837, + "epoch": 0.15186677087717187, + "flos": 58174569901440.0, + "grad_norm": 0.82025512844847, + "language_loss": 0.61884898, + "learning_rate": 3.8463399308238e-06, + "loss": 0.63994879, + "num_input_tokens_seen": 26780990, + "step": 1263, + "time_per_iteration": 3.1565325260162354 + }, + { + "auxiliary_loss_clip": 0.01216945, + "auxiliary_loss_mlp": 0.01040488, + "balance_loss_clip": 1.06655836, + "balance_loss_mlp": 1.02850199, + "epoch": 0.15198701376781099, + "flos": 32669696448000.0, + "grad_norm": 1.950180375412627, + "language_loss": 0.63794374, + "learning_rate": 3.846040360312402e-06, + "loss": 0.66051811, + "num_input_tokens_seen": 26804250, + "step": 1264, + "time_per_iteration": 2.6542649269104004 + }, + { + "auxiliary_loss_clip": 0.01232222, + "auxiliary_loss_mlp": 0.01042431, + "balance_loss_clip": 1.06537294, + "balance_loss_mlp": 1.03154683, + "epoch": 0.15210725665845007, + "flos": 28402575431040.0, + "grad_norm": 5.505775451890121, + "language_loss": 0.81541437, + "learning_rate": 3.8457405097581485e-06, + "loss": 0.83816087, + "num_input_tokens_seen": 26823240, + "step": 1265, + "time_per_iteration": 2.6206254959106445 + }, + { + "auxiliary_loss_clip": 0.01172616, + "auxiliary_loss_mlp": 0.01040508, + "balance_loss_clip": 1.05505085, + "balance_loss_mlp": 1.0293622, + "epoch": 0.15222749954908915, + "flos": 19938179393280.0, + "grad_norm": 2.0813915623907375, + "language_loss": 0.77849263, + "learning_rate": 3.8454403792065275e-06, + "loss": 0.80062389, + "num_input_tokens_seen": 26842060, + "step": 1266, + "time_per_iteration": 2.6828672885894775 + }, + { + "auxiliary_loss_clip": 0.01175289, + "auxiliary_loss_mlp": 0.01048096, + "balance_loss_clip": 1.05693722, + "balance_loss_mlp": 1.03721261, + "epoch": 0.15234774243972826, + "flos": 21324223451520.0, + "grad_norm": 1.9520953429284653, + "language_loss": 0.85729051, + "learning_rate": 3.845139968703068e-06, + "loss": 0.87952435, + "num_input_tokens_seen": 26859580, + "step": 1267, + "time_per_iteration": 2.5823848247528076 + }, + { + "auxiliary_loss_clip": 0.01169603, + "auxiliary_loss_mlp": 0.01044353, + "balance_loss_clip": 1.0563761, + "balance_loss_mlp": 1.03303993, + "epoch": 0.15246798533036734, + "flos": 25957812977280.0, + "grad_norm": 1.9090533275148713, + "language_loss": 0.82801747, + "learning_rate": 3.844839278293342e-06, + "loss": 0.85015702, + "num_input_tokens_seen": 26880430, + "step": 1268, + "time_per_iteration": 2.7268917560577393 + }, + { + "auxiliary_loss_clip": 0.01236778, + "auxiliary_loss_mlp": 0.0103976, + "balance_loss_clip": 1.0693264, + "balance_loss_mlp": 1.02874529, + "epoch": 0.15258822822100643, + "flos": 25811907932160.0, + "grad_norm": 2.8893714409464484, + "language_loss": 0.7668348, + "learning_rate": 3.8445383080229654e-06, + "loss": 0.78960019, + "num_input_tokens_seen": 26896445, + "step": 1269, + "time_per_iteration": 2.538482189178467 + }, + { + "auxiliary_loss_clip": 0.01194403, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.05814433, + "balance_loss_mlp": 1.02336049, + "epoch": 0.1527084711116455, + "flos": 25265455349760.0, + "grad_norm": 2.1131678030585768, + "language_loss": 0.7376042, + "learning_rate": 3.844237057937593e-06, + "loss": 0.75989389, + "num_input_tokens_seen": 26915450, + "step": 1270, + "time_per_iteration": 2.585883378982544 + }, + { + "auxiliary_loss_clip": 0.0122409, + "auxiliary_loss_mlp": 0.0103638, + "balance_loss_clip": 1.06435156, + "balance_loss_mlp": 1.02497745, + "epoch": 0.15282871400228462, + "flos": 29240227572480.0, + "grad_norm": 2.400238039857949, + "language_loss": 0.77860111, + "learning_rate": 3.843935528082926e-06, + "loss": 0.80120587, + "num_input_tokens_seen": 26936475, + "step": 1271, + "time_per_iteration": 2.593721628189087 + }, + { + "auxiliary_loss_clip": 0.01218084, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.06450868, + "balance_loss_mlp": 1.0230782, + "epoch": 0.1529489568929237, + "flos": 20882952869760.0, + "grad_norm": 1.8662503511665753, + "language_loss": 0.8498466, + "learning_rate": 3.843633718504704e-06, + "loss": 0.87236059, + "num_input_tokens_seen": 26954920, + "step": 1272, + "time_per_iteration": 2.5233373641967773 + }, + { + "auxiliary_loss_clip": 0.01184753, + "auxiliary_loss_mlp": 0.01037267, + "balance_loss_clip": 1.05964136, + "balance_loss_mlp": 1.0268724, + "epoch": 0.1530691997835628, + "flos": 20083833043200.0, + "grad_norm": 2.500035629287759, + "language_loss": 0.90378654, + "learning_rate": 3.843331629248715e-06, + "loss": 0.92600679, + "num_input_tokens_seen": 26972520, + "step": 1273, + "time_per_iteration": 2.6000401973724365 + }, + { + "auxiliary_loss_clip": 0.01234526, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.06784344, + "balance_loss_mlp": 1.02362239, + "epoch": 0.1531894426742019, + "flos": 28759814144640.0, + "grad_norm": 2.3991455516462037, + "language_loss": 0.76452708, + "learning_rate": 3.843029260360782e-06, + "loss": 0.78721005, + "num_input_tokens_seen": 26990890, + "step": 1274, + "time_per_iteration": 3.3469324111938477 + }, + { + "auxiliary_loss_clip": 0.01218224, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_clip": 1.06559873, + "balance_loss_mlp": 1.03507292, + "epoch": 0.15330968556484098, + "flos": 22236282616320.0, + "grad_norm": 2.5379867313463187, + "language_loss": 0.79083681, + "learning_rate": 3.8427266118867755e-06, + "loss": 0.81347132, + "num_input_tokens_seen": 27010640, + "step": 1275, + "time_per_iteration": 2.564117908477783 + }, + { + "auxiliary_loss_clip": 0.01201257, + "auxiliary_loss_mlp": 0.01036847, + "balance_loss_clip": 1.06319606, + "balance_loss_mlp": 1.02596307, + "epoch": 0.15342992845548006, + "flos": 27527504296320.0, + "grad_norm": 2.1819681684979897, + "language_loss": 0.82657218, + "learning_rate": 3.842423683872608e-06, + "loss": 0.84895325, + "num_input_tokens_seen": 27031215, + "step": 1276, + "time_per_iteration": 2.5950303077697754 + }, + { + "auxiliary_loss_clip": 0.01216163, + "auxiliary_loss_mlp": 0.01043493, + "balance_loss_clip": 1.0629673, + "balance_loss_mlp": 1.03274584, + "epoch": 0.15355017134611917, + "flos": 19609596754560.0, + "grad_norm": 2.445758739801298, + "language_loss": 0.77784663, + "learning_rate": 3.842120476364232e-06, + "loss": 0.80044317, + "num_input_tokens_seen": 27049665, + "step": 1277, + "time_per_iteration": 2.517185688018799 + }, + { + "auxiliary_loss_clip": 0.01222064, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.06284201, + "balance_loss_mlp": 1.02114904, + "epoch": 0.15367041423675826, + "flos": 18478590238080.0, + "grad_norm": 2.086664832911439, + "language_loss": 0.83740085, + "learning_rate": 3.841816989407644e-06, + "loss": 0.85993969, + "num_input_tokens_seen": 27065155, + "step": 1278, + "time_per_iteration": 2.4948816299438477 + }, + { + "auxiliary_loss_clip": 0.0118492, + "auxiliary_loss_mlp": 0.01046029, + "balance_loss_clip": 1.06155968, + "balance_loss_mlp": 1.03506184, + "epoch": 0.15379065712739734, + "flos": 41427662342400.0, + "grad_norm": 1.9984647715335626, + "language_loss": 0.7709744, + "learning_rate": 3.841513223048884e-06, + "loss": 0.79328394, + "num_input_tokens_seen": 27085840, + "step": 1279, + "time_per_iteration": 2.7593600749969482 + }, + { + "auxiliary_loss_clip": 0.01182383, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.05660594, + "balance_loss_mlp": 1.02737617, + "epoch": 0.15391090001803642, + "flos": 22054215553920.0, + "grad_norm": 2.4294632891232535, + "language_loss": 0.78472275, + "learning_rate": 3.841209177334031e-06, + "loss": 0.80692917, + "num_input_tokens_seen": 27104200, + "step": 1280, + "time_per_iteration": 3.4639816284179688 + }, + { + "auxiliary_loss_clip": 0.01213993, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.06417727, + "balance_loss_mlp": 1.02682781, + "epoch": 0.15403114290867553, + "flos": 15450351258240.0, + "grad_norm": 2.066657805289406, + "language_loss": 0.74962389, + "learning_rate": 3.84090485230921e-06, + "loss": 0.7721377, + "num_input_tokens_seen": 27122440, + "step": 1281, + "time_per_iteration": 3.363729238510132 + }, + { + "auxiliary_loss_clip": 0.01233154, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.06654787, + "balance_loss_mlp": 1.02524614, + "epoch": 0.15415138579931462, + "flos": 17929156826880.0, + "grad_norm": 3.197919723147775, + "language_loss": 0.76139849, + "learning_rate": 3.840600248020588e-06, + "loss": 0.78409076, + "num_input_tokens_seen": 27139380, + "step": 1282, + "time_per_iteration": 2.443704843521118 + }, + { + "auxiliary_loss_clip": 0.01206283, + "auxiliary_loss_mlp": 0.01047181, + "balance_loss_clip": 1.05950594, + "balance_loss_mlp": 1.0359571, + "epoch": 0.1542716286899537, + "flos": 11429325296640.0, + "grad_norm": 2.218352223811508, + "language_loss": 0.79768419, + "learning_rate": 3.840295364514371e-06, + "loss": 0.82021886, + "num_input_tokens_seen": 27156760, + "step": 1283, + "time_per_iteration": 2.545259952545166 + }, + { + "auxiliary_loss_clip": 0.01202859, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.06284356, + "balance_loss_mlp": 1.02999425, + "epoch": 0.1543918715805928, + "flos": 17420338719360.0, + "grad_norm": 2.3094999921301653, + "language_loss": 0.7854867, + "learning_rate": 3.83999020183681e-06, + "loss": 0.80792642, + "num_input_tokens_seen": 27175455, + "step": 1284, + "time_per_iteration": 2.6204116344451904 + }, + { + "auxiliary_loss_clip": 0.01146617, + "auxiliary_loss_mlp": 0.01043022, + "balance_loss_clip": 1.05383277, + "balance_loss_mlp": 1.03269887, + "epoch": 0.1545121144712319, + "flos": 17786376264960.0, + "grad_norm": 2.061416944553707, + "language_loss": 0.78716886, + "learning_rate": 3.839684760034199e-06, + "loss": 0.80906522, + "num_input_tokens_seen": 27193660, + "step": 1285, + "time_per_iteration": 2.7224276065826416 + }, + { + "auxiliary_loss_clip": 0.01181168, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.05995119, + "balance_loss_mlp": 1.02752066, + "epoch": 0.15463235736187098, + "flos": 28220185146240.0, + "grad_norm": 2.1749821451854863, + "language_loss": 0.65758193, + "learning_rate": 3.8393790391528716e-06, + "loss": 0.67978358, + "num_input_tokens_seen": 27214355, + "step": 1286, + "time_per_iteration": 2.6035068035125732 + }, + { + "auxiliary_loss_clip": 0.01198753, + "auxiliary_loss_mlp": 0.01037027, + "balance_loss_clip": 1.05927157, + "balance_loss_mlp": 1.02650094, + "epoch": 0.15475260025251006, + "flos": 22856890826880.0, + "grad_norm": 2.0392908280263033, + "language_loss": 0.89314538, + "learning_rate": 3.8390730392392075e-06, + "loss": 0.9155032, + "num_input_tokens_seen": 27234335, + "step": 1287, + "time_per_iteration": 2.540876626968384 + }, + { + "auxiliary_loss_clip": 0.01234376, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.06759024, + "balance_loss_mlp": 1.02630162, + "epoch": 0.15487284314314917, + "flos": 17602872658560.0, + "grad_norm": 3.9021632639204054, + "language_loss": 0.79294878, + "learning_rate": 3.838766760339626e-06, + "loss": 0.81565845, + "num_input_tokens_seen": 27252860, + "step": 1288, + "time_per_iteration": 2.4512548446655273 + }, + { + "auxiliary_loss_clip": 0.01166882, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.05561733, + "balance_loss_mlp": 1.02815199, + "epoch": 0.15499308603378825, + "flos": 20082037363200.0, + "grad_norm": 2.477251727838228, + "language_loss": 0.79087478, + "learning_rate": 3.838460202500587e-06, + "loss": 0.8129375, + "num_input_tokens_seen": 27268650, + "step": 1289, + "time_per_iteration": 2.575636863708496 + }, + { + "auxiliary_loss_clip": 0.01181334, + "auxiliary_loss_mlp": 0.01034357, + "balance_loss_clip": 1.06307626, + "balance_loss_mlp": 1.02257359, + "epoch": 0.15511332892442733, + "flos": 15918051271680.0, + "grad_norm": 2.5256556364037914, + "language_loss": 0.74148095, + "learning_rate": 3.838153365768599e-06, + "loss": 0.76363784, + "num_input_tokens_seen": 27285160, + "step": 1290, + "time_per_iteration": 2.6385819911956787 + }, + { + "auxiliary_loss_clip": 0.01185779, + "auxiliary_loss_mlp": 0.01050897, + "balance_loss_clip": 1.0661453, + "balance_loss_mlp": 1.03948307, + "epoch": 0.15523357181506645, + "flos": 41282475569280.0, + "grad_norm": 2.4787875437808387, + "language_loss": 0.7563808, + "learning_rate": 3.837846250190206e-06, + "loss": 0.77874762, + "num_input_tokens_seen": 27308025, + "step": 1291, + "time_per_iteration": 2.7535786628723145 + }, + { + "auxiliary_loss_clip": 0.01164503, + "auxiliary_loss_mlp": 0.0076618, + "balance_loss_clip": 1.05694509, + "balance_loss_mlp": 1.00158119, + "epoch": 0.15535381470570553, + "flos": 18478769806080.0, + "grad_norm": 1.9988032328664185, + "language_loss": 0.76965094, + "learning_rate": 3.837538855811998e-06, + "loss": 0.78895772, + "num_input_tokens_seen": 27326200, + "step": 1292, + "time_per_iteration": 2.5844223499298096 + }, + { + "auxiliary_loss_clip": 0.01209864, + "auxiliary_loss_mlp": 0.01043264, + "balance_loss_clip": 1.06541908, + "balance_loss_mlp": 1.03286862, + "epoch": 0.1554740575963446, + "flos": 13918150759680.0, + "grad_norm": 2.072007059468015, + "language_loss": 0.71081936, + "learning_rate": 3.837231182680606e-06, + "loss": 0.73335069, + "num_input_tokens_seen": 27344165, + "step": 1293, + "time_per_iteration": 2.5352165699005127 + }, + { + "auxiliary_loss_clip": 0.01225103, + "auxiliary_loss_mlp": 0.01037405, + "balance_loss_clip": 1.06754553, + "balance_loss_mlp": 1.02632999, + "epoch": 0.1555943004869837, + "flos": 20847078161280.0, + "grad_norm": 1.6758742727645968, + "language_loss": 0.75904238, + "learning_rate": 3.836923230842706e-06, + "loss": 0.78166747, + "num_input_tokens_seen": 27363280, + "step": 1294, + "time_per_iteration": 2.5206072330474854 + }, + { + "auxiliary_loss_clip": 0.01171255, + "auxiliary_loss_mlp": 0.01040005, + "balance_loss_clip": 1.05404019, + "balance_loss_mlp": 1.02873349, + "epoch": 0.1557145433776228, + "flos": 22085888371200.0, + "grad_norm": 1.8952306131757557, + "language_loss": 0.81068122, + "learning_rate": 3.836615000345011e-06, + "loss": 0.83279383, + "num_input_tokens_seen": 27381460, + "step": 1295, + "time_per_iteration": 2.6306960582733154 + }, + { + "auxiliary_loss_clip": 0.01229092, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.06455302, + "balance_loss_mlp": 1.02669811, + "epoch": 0.1558347862682619, + "flos": 19791987039360.0, + "grad_norm": 1.9693740919876468, + "language_loss": 0.77962816, + "learning_rate": 3.836306491234282e-06, + "loss": 0.8022846, + "num_input_tokens_seen": 27399310, + "step": 1296, + "time_per_iteration": 2.4918596744537354 + }, + { + "auxiliary_loss_clip": 0.01197237, + "auxiliary_loss_mlp": 0.01040416, + "balance_loss_clip": 1.06636786, + "balance_loss_mlp": 1.03065872, + "epoch": 0.15595502915890097, + "flos": 17237086508160.0, + "grad_norm": 2.256355953925597, + "language_loss": 0.75914252, + "learning_rate": 3.835997703557317e-06, + "loss": 0.78151906, + "num_input_tokens_seen": 27416050, + "step": 1297, + "time_per_iteration": 2.54742431640625 + }, + { + "auxiliary_loss_clip": 0.01169121, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.05299473, + "balance_loss_mlp": 1.02845311, + "epoch": 0.15607527204954008, + "flos": 19719519350400.0, + "grad_norm": 1.645801302749267, + "language_loss": 0.80161953, + "learning_rate": 3.83568863736096e-06, + "loss": 0.82369852, + "num_input_tokens_seen": 27434920, + "step": 1298, + "time_per_iteration": 2.6482622623443604 + }, + { + "auxiliary_loss_clip": 0.01187663, + "auxiliary_loss_mlp": 0.01037645, + "balance_loss_clip": 1.05865836, + "balance_loss_mlp": 1.02741694, + "epoch": 0.15619551494017916, + "flos": 18515650095360.0, + "grad_norm": 2.8277417804540192, + "language_loss": 0.89143074, + "learning_rate": 3.8353792926920975e-06, + "loss": 0.91368371, + "num_input_tokens_seen": 27453570, + "step": 1299, + "time_per_iteration": 2.560147285461426 + }, + { + "auxiliary_loss_clip": 0.01225637, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.0686295, + "balance_loss_mlp": 1.03401339, + "epoch": 0.15631575783081825, + "flos": 19902125116800.0, + "grad_norm": 2.1531110795537747, + "language_loss": 0.81672341, + "learning_rate": 3.835069669597655e-06, + "loss": 0.83943117, + "num_input_tokens_seen": 27471960, + "step": 1300, + "time_per_iteration": 3.353548526763916 + }, + { + "auxiliary_loss_clip": 0.01222735, + "auxiliary_loss_mlp": 0.00766352, + "balance_loss_clip": 1.0662626, + "balance_loss_mlp": 1.0013653, + "epoch": 0.15643600072145733, + "flos": 20777663128320.0, + "grad_norm": 2.1402735527529555, + "language_loss": 0.79648507, + "learning_rate": 3.834759768124603e-06, + "loss": 0.81637597, + "num_input_tokens_seen": 27490835, + "step": 1301, + "time_per_iteration": 2.532723903656006 + }, + { + "auxiliary_loss_clip": 0.01192352, + "auxiliary_loss_mlp": 0.01035551, + "balance_loss_clip": 1.06492293, + "balance_loss_mlp": 1.0251143, + "epoch": 0.15655624361209644, + "flos": 18546389159040.0, + "grad_norm": 2.223335978249787, + "language_loss": 0.76437336, + "learning_rate": 3.834449588319953e-06, + "loss": 0.78665245, + "num_input_tokens_seen": 27508870, + "step": 1302, + "time_per_iteration": 2.549192190170288 + }, + { + "auxiliary_loss_clip": 0.01215807, + "auxiliary_loss_mlp": 0.01040587, + "balance_loss_clip": 1.06791782, + "balance_loss_mlp": 1.0304544, + "epoch": 0.15667648650273552, + "flos": 25229544727680.0, + "grad_norm": 1.7577237529635878, + "language_loss": 0.85125482, + "learning_rate": 3.834139130230758e-06, + "loss": 0.87381887, + "num_input_tokens_seen": 27528175, + "step": 1303, + "time_per_iteration": 2.5474674701690674 + }, + { + "auxiliary_loss_clip": 0.01204918, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.06090903, + "balance_loss_mlp": 1.02859271, + "epoch": 0.1567967293933746, + "flos": 24827093769600.0, + "grad_norm": 1.8089787601871836, + "language_loss": 0.81391156, + "learning_rate": 3.833828393904117e-06, + "loss": 0.83635116, + "num_input_tokens_seen": 27548455, + "step": 1304, + "time_per_iteration": 2.595574378967285 + }, + { + "auxiliary_loss_clip": 0.01166488, + "auxiliary_loss_mlp": 0.01033154, + "balance_loss_clip": 1.05539382, + "balance_loss_mlp": 1.02206767, + "epoch": 0.15691697228401372, + "flos": 19164555244800.0, + "grad_norm": 2.3217233323079047, + "language_loss": 0.77731085, + "learning_rate": 3.833517379387165e-06, + "loss": 0.79930729, + "num_input_tokens_seen": 27564910, + "step": 1305, + "time_per_iteration": 2.5775556564331055 + }, + { + "auxiliary_loss_clip": 0.01222565, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.06769645, + "balance_loss_mlp": 1.03103673, + "epoch": 0.1570372151746528, + "flos": 24790931752320.0, + "grad_norm": 3.092072864454218, + "language_loss": 0.88825744, + "learning_rate": 3.833206086727085e-06, + "loss": 0.91090059, + "num_input_tokens_seen": 27584260, + "step": 1306, + "time_per_iteration": 2.5487661361694336 + }, + { + "auxiliary_loss_clip": 0.01190021, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.05850053, + "balance_loss_mlp": 1.02743971, + "epoch": 0.15715745806529188, + "flos": 24863650836480.0, + "grad_norm": 1.963992388615502, + "language_loss": 0.70863497, + "learning_rate": 3.8328945159710994e-06, + "loss": 0.73090994, + "num_input_tokens_seen": 27604440, + "step": 1307, + "time_per_iteration": 4.979633331298828 + }, + { + "auxiliary_loss_clip": 0.01226948, + "auxiliary_loss_mlp": 0.00765368, + "balance_loss_clip": 1.06996155, + "balance_loss_mlp": 1.00123274, + "epoch": 0.157277700955931, + "flos": 21872148491520.0, + "grad_norm": 2.3597321218678435, + "language_loss": 0.88840675, + "learning_rate": 3.832582667166473e-06, + "loss": 0.90832996, + "num_input_tokens_seen": 27624250, + "step": 1308, + "time_per_iteration": 2.510880947113037 + }, + { + "auxiliary_loss_clip": 0.01205869, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.06444168, + "balance_loss_mlp": 1.02699482, + "epoch": 0.15739794384657008, + "flos": 24533344344960.0, + "grad_norm": 1.741141653743624, + "language_loss": 0.82020366, + "learning_rate": 3.8322705403605125e-06, + "loss": 0.84264708, + "num_input_tokens_seen": 27644595, + "step": 1309, + "time_per_iteration": 2.5459489822387695 + }, + { + "auxiliary_loss_clip": 0.01195501, + "auxiliary_loss_mlp": 0.01037319, + "balance_loss_clip": 1.06246519, + "balance_loss_mlp": 1.02796674, + "epoch": 0.15751818673720916, + "flos": 17745329998080.0, + "grad_norm": 2.433016627240624, + "language_loss": 0.80914116, + "learning_rate": 3.831958135600568e-06, + "loss": 0.8314693, + "num_input_tokens_seen": 27662145, + "step": 1310, + "time_per_iteration": 2.481736421585083 + }, + { + "auxiliary_loss_clip": 0.01218928, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.06626499, + "balance_loss_mlp": 1.02568829, + "epoch": 0.15763842962784824, + "flos": 17858520731520.0, + "grad_norm": 1.9674667067269547, + "language_loss": 0.79463673, + "learning_rate": 3.831645452934032e-06, + "loss": 0.81717527, + "num_input_tokens_seen": 27680575, + "step": 1311, + "time_per_iteration": 2.461761474609375 + }, + { + "auxiliary_loss_clip": 0.0123525, + "auxiliary_loss_mlp": 0.01044459, + "balance_loss_clip": 1.07023692, + "balance_loss_mlp": 1.03440952, + "epoch": 0.15775867251848735, + "flos": 26980908059520.0, + "grad_norm": 1.8567165983109104, + "language_loss": 0.80104721, + "learning_rate": 3.831332492408336e-06, + "loss": 0.82384431, + "num_input_tokens_seen": 27701985, + "step": 1312, + "time_per_iteration": 2.5449059009552 + }, + { + "auxiliary_loss_clip": 0.01199619, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.06210828, + "balance_loss_mlp": 1.0205543, + "epoch": 0.15787891540912644, + "flos": 19240398812160.0, + "grad_norm": 2.0189939879799477, + "language_loss": 0.69460416, + "learning_rate": 3.831019254070957e-06, + "loss": 0.71690983, + "num_input_tokens_seen": 27719770, + "step": 1313, + "time_per_iteration": 2.5115060806274414 + }, + { + "auxiliary_loss_clip": 0.01175294, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.05974579, + "balance_loss_mlp": 1.02364564, + "epoch": 0.15799915829976552, + "flos": 27271102037760.0, + "grad_norm": 2.720418961676346, + "language_loss": 0.95244241, + "learning_rate": 3.8307057379694135e-06, + "loss": 0.9745326, + "num_input_tokens_seen": 27739105, + "step": 1314, + "time_per_iteration": 2.6371397972106934 + }, + { + "auxiliary_loss_clip": 0.01231751, + "auxiliary_loss_mlp": 0.01041856, + "balance_loss_clip": 1.06557298, + "balance_loss_mlp": 1.03184867, + "epoch": 0.15811940119040463, + "flos": 20405520270720.0, + "grad_norm": 2.0725347502638236, + "language_loss": 0.82151711, + "learning_rate": 3.830391944151264e-06, + "loss": 0.84425312, + "num_input_tokens_seen": 27754985, + "step": 1315, + "time_per_iteration": 2.4328408241271973 + }, + { + "auxiliary_loss_clip": 0.01200777, + "auxiliary_loss_mlp": 0.01043284, + "balance_loss_clip": 1.06002641, + "balance_loss_mlp": 1.03357458, + "epoch": 0.1582396440810437, + "flos": 32599347661440.0, + "grad_norm": 1.932466968115719, + "language_loss": 0.67339623, + "learning_rate": 3.830077872664114e-06, + "loss": 0.6958369, + "num_input_tokens_seen": 27776110, + "step": 1316, + "time_per_iteration": 2.6023385524749756 + }, + { + "auxiliary_loss_clip": 0.01154208, + "auxiliary_loss_mlp": 0.01042564, + "balance_loss_clip": 1.05398107, + "balance_loss_mlp": 1.03291392, + "epoch": 0.1583598869716828, + "flos": 33800559310080.0, + "grad_norm": 1.7667233206382023, + "language_loss": 0.72862113, + "learning_rate": 3.829763523555604e-06, + "loss": 0.75058889, + "num_input_tokens_seen": 27796510, + "step": 1317, + "time_per_iteration": 2.7155137062072754 + }, + { + "auxiliary_loss_clip": 0.01211408, + "auxiliary_loss_mlp": 0.01035761, + "balance_loss_clip": 1.06838012, + "balance_loss_mlp": 1.02619505, + "epoch": 0.15848012986232188, + "flos": 24681332378880.0, + "grad_norm": 2.114874555651769, + "language_loss": 0.77905083, + "learning_rate": 3.829448896873423e-06, + "loss": 0.80152249, + "num_input_tokens_seen": 27815610, + "step": 1318, + "time_per_iteration": 2.543682813644409 + }, + { + "auxiliary_loss_clip": 0.01159997, + "auxiliary_loss_mlp": 0.00765157, + "balance_loss_clip": 1.06019151, + "balance_loss_mlp": 1.00129008, + "epoch": 0.158600372752961, + "flos": 22602068766720.0, + "grad_norm": 1.7690122907861627, + "language_loss": 0.79127955, + "learning_rate": 3.829133992665299e-06, + "loss": 0.81053114, + "num_input_tokens_seen": 27834735, + "step": 1319, + "time_per_iteration": 2.610743761062622 + }, + { + "auxiliary_loss_clip": 0.01205086, + "auxiliary_loss_mlp": 0.01038033, + "balance_loss_clip": 1.06355608, + "balance_loss_mlp": 1.02825737, + "epoch": 0.15872061564360007, + "flos": 27927944092800.0, + "grad_norm": 2.1452207925442734, + "language_loss": 0.88678765, + "learning_rate": 3.828818810979002e-06, + "loss": 0.90921885, + "num_input_tokens_seen": 27853065, + "step": 1320, + "time_per_iteration": 2.5379014015197754 + }, + { + "auxiliary_loss_clip": 0.01232221, + "auxiliary_loss_mlp": 0.01041007, + "balance_loss_clip": 1.07075691, + "balance_loss_mlp": 1.0312196, + "epoch": 0.15884085853423915, + "flos": 23696805525120.0, + "grad_norm": 1.8274815272764542, + "language_loss": 0.80584955, + "learning_rate": 3.8285033518623454e-06, + "loss": 0.82858193, + "num_input_tokens_seen": 27873315, + "step": 1321, + "time_per_iteration": 2.4854278564453125 + }, + { + "auxiliary_loss_clip": 0.01221472, + "auxiliary_loss_mlp": 0.01040578, + "balance_loss_clip": 1.06742227, + "balance_loss_mlp": 1.02958679, + "epoch": 0.15896110142487826, + "flos": 23112359331840.0, + "grad_norm": 2.692626492663524, + "language_loss": 0.81532472, + "learning_rate": 3.8281876153631845e-06, + "loss": 0.83794522, + "num_input_tokens_seen": 27890070, + "step": 1322, + "time_per_iteration": 2.507112979888916 + }, + { + "auxiliary_loss_clip": 0.01166667, + "auxiliary_loss_mlp": 0.0104213, + "balance_loss_clip": 1.05711365, + "balance_loss_mlp": 1.03118706, + "epoch": 0.15908134431551735, + "flos": 14685238632960.0, + "grad_norm": 1.883383624501825, + "language_loss": 0.64697874, + "learning_rate": 3.827871601529416e-06, + "loss": 0.66906679, + "num_input_tokens_seen": 27908590, + "step": 1323, + "time_per_iteration": 2.5850954055786133 + }, + { + "auxiliary_loss_clip": 0.01179731, + "auxiliary_loss_mlp": 0.01040591, + "balance_loss_clip": 1.05840039, + "balance_loss_mlp": 1.0305593, + "epoch": 0.15920158720615643, + "flos": 20193611984640.0, + "grad_norm": 1.7632585409258437, + "language_loss": 0.80515623, + "learning_rate": 3.827555310408979e-06, + "loss": 0.82735944, + "num_input_tokens_seen": 27927985, + "step": 1324, + "time_per_iteration": 2.5570547580718994 + }, + { + "auxiliary_loss_clip": 0.01181059, + "auxiliary_loss_mlp": 0.01038098, + "balance_loss_clip": 1.06348395, + "balance_loss_mlp": 1.02760148, + "epoch": 0.1593218300967955, + "flos": 24826626892800.0, + "grad_norm": 1.7186704152473238, + "language_loss": 0.82870293, + "learning_rate": 3.827238742049854e-06, + "loss": 0.85089451, + "num_input_tokens_seen": 27948280, + "step": 1325, + "time_per_iteration": 2.654414415359497 + }, + { + "auxiliary_loss_clip": 0.01229299, + "auxiliary_loss_mlp": 0.01037082, + "balance_loss_clip": 1.06540775, + "balance_loss_mlp": 1.02651358, + "epoch": 0.15944207298743462, + "flos": 28328707111680.0, + "grad_norm": 1.949110916574243, + "language_loss": 0.52068645, + "learning_rate": 3.826921896500066e-06, + "loss": 0.54335022, + "num_input_tokens_seen": 27969565, + "step": 1326, + "time_per_iteration": 2.5206069946289062 + }, + { + "auxiliary_loss_clip": 0.01190421, + "auxiliary_loss_mlp": 0.01036531, + "balance_loss_clip": 1.06250894, + "balance_loss_mlp": 1.02545023, + "epoch": 0.1595623158780737, + "flos": 22964838174720.0, + "grad_norm": 2.819476890216086, + "language_loss": 0.7808429, + "learning_rate": 3.826604773807678e-06, + "loss": 0.80311245, + "num_input_tokens_seen": 27987540, + "step": 1327, + "time_per_iteration": 3.3958075046539307 + }, + { + "auxiliary_loss_clip": 0.01196276, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.05713904, + "balance_loss_mlp": 1.02485597, + "epoch": 0.1596825587687128, + "flos": 19710540950400.0, + "grad_norm": 2.73269049048548, + "language_loss": 0.73612595, + "learning_rate": 3.826287374020798e-06, + "loss": 0.75844634, + "num_input_tokens_seen": 28002345, + "step": 1328, + "time_per_iteration": 2.5247321128845215 + }, + { + "auxiliary_loss_clip": 0.01231704, + "auxiliary_loss_mlp": 0.01036938, + "balance_loss_clip": 1.06858444, + "balance_loss_mlp": 1.02735353, + "epoch": 0.1598028016593519, + "flos": 22637727993600.0, + "grad_norm": 2.068608124183978, + "language_loss": 0.82837182, + "learning_rate": 3.825969697187575e-06, + "loss": 0.85105824, + "num_input_tokens_seen": 28021675, + "step": 1329, + "time_per_iteration": 2.4613306522369385 + }, + { + "auxiliary_loss_clip": 0.01181687, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.05808735, + "balance_loss_mlp": 1.02122116, + "epoch": 0.15992304454999098, + "flos": 20482908122880.0, + "grad_norm": 1.8308811204645985, + "language_loss": 0.69524753, + "learning_rate": 3.8256517433562015e-06, + "loss": 0.71737945, + "num_input_tokens_seen": 28039615, + "step": 1330, + "time_per_iteration": 2.5553975105285645 + }, + { + "auxiliary_loss_clip": 0.01227855, + "auxiliary_loss_mlp": 0.01036025, + "balance_loss_clip": 1.06534481, + "balance_loss_mlp": 1.02704883, + "epoch": 0.16004328744063007, + "flos": 17676094533120.0, + "grad_norm": 2.7546571020653694, + "language_loss": 0.91858697, + "learning_rate": 3.82533351257491e-06, + "loss": 0.94122577, + "num_input_tokens_seen": 28057565, + "step": 1331, + "time_per_iteration": 2.4413905143737793 + }, + { + "auxiliary_loss_clip": 0.0121539, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.06783652, + "balance_loss_mlp": 1.02681303, + "epoch": 0.16016353033126918, + "flos": 24098717779200.0, + "grad_norm": 1.7333625243184108, + "language_loss": 0.88734305, + "learning_rate": 3.825015004891975e-06, + "loss": 0.90986061, + "num_input_tokens_seen": 28076305, + "step": 1332, + "time_per_iteration": 2.5300896167755127 + }, + { + "auxiliary_loss_clip": 0.01211026, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.06296003, + "balance_loss_mlp": 1.02062535, + "epoch": 0.16028377322190826, + "flos": 27634841112960.0, + "grad_norm": 1.8013514686793173, + "language_loss": 0.7596792, + "learning_rate": 3.824696220355716e-06, + "loss": 0.78209519, + "num_input_tokens_seen": 28097895, + "step": 1333, + "time_per_iteration": 2.5350000858306885 + }, + { + "auxiliary_loss_clip": 0.01196898, + "auxiliary_loss_mlp": 0.01044124, + "balance_loss_clip": 1.06147027, + "balance_loss_mlp": 1.03393137, + "epoch": 0.16040401611254734, + "flos": 20961202648320.0, + "grad_norm": 1.5950696886137068, + "language_loss": 0.78940523, + "learning_rate": 3.824377159014491e-06, + "loss": 0.81181544, + "num_input_tokens_seen": 28118790, + "step": 1334, + "time_per_iteration": 3.5115861892700195 + }, + { + "auxiliary_loss_clip": 0.01211728, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.06505656, + "balance_loss_mlp": 1.02653742, + "epoch": 0.16052425900318643, + "flos": 21247051080960.0, + "grad_norm": 2.7081115425188123, + "language_loss": 0.85111851, + "learning_rate": 3.824057820916702e-06, + "loss": 0.87359881, + "num_input_tokens_seen": 28135995, + "step": 1335, + "time_per_iteration": 2.549170970916748 + }, + { + "auxiliary_loss_clip": 0.01201159, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.06250501, + "balance_loss_mlp": 1.02070642, + "epoch": 0.16064450189382554, + "flos": 15524004096000.0, + "grad_norm": 5.70837034613212, + "language_loss": 0.71691126, + "learning_rate": 3.8237382061107904e-06, + "loss": 0.73923886, + "num_input_tokens_seen": 28152715, + "step": 1336, + "time_per_iteration": 2.482983112335205 + }, + { + "auxiliary_loss_clip": 0.01125025, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_clip": 1.04826617, + "balance_loss_mlp": 1.03539991, + "epoch": 0.16076474478446462, + "flos": 21178497974400.0, + "grad_norm": 1.9038784275590552, + "language_loss": 0.78527892, + "learning_rate": 3.823418314645243e-06, + "loss": 0.80698025, + "num_input_tokens_seen": 28171590, + "step": 1337, + "time_per_iteration": 2.6402690410614014 + }, + { + "auxiliary_loss_clip": 0.01151947, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.05759811, + "balance_loss_mlp": 1.03121805, + "epoch": 0.1608849876751037, + "flos": 18366476912640.0, + "grad_norm": 2.2080051398346994, + "language_loss": 0.75382489, + "learning_rate": 3.823098146568588e-06, + "loss": 0.77574986, + "num_input_tokens_seen": 28191295, + "step": 1338, + "time_per_iteration": 2.5634398460388184 + }, + { + "auxiliary_loss_clip": 0.01211793, + "auxiliary_loss_mlp": 0.01037522, + "balance_loss_clip": 1.06369948, + "balance_loss_mlp": 1.02862263, + "epoch": 0.1610052305657428, + "flos": 29497024880640.0, + "grad_norm": 1.6013405989881024, + "language_loss": 0.71679163, + "learning_rate": 3.822777701929394e-06, + "loss": 0.73928481, + "num_input_tokens_seen": 28213120, + "step": 1339, + "time_per_iteration": 2.5468263626098633 + }, + { + "auxiliary_loss_clip": 0.01200914, + "auxiliary_loss_mlp": 0.01041717, + "balance_loss_clip": 1.05954421, + "balance_loss_mlp": 1.03128624, + "epoch": 0.1611254734563819, + "flos": 26797871329920.0, + "grad_norm": 1.8032768835029767, + "language_loss": 0.73576581, + "learning_rate": 3.8224569807762714e-06, + "loss": 0.75819218, + "num_input_tokens_seen": 28232440, + "step": 1340, + "time_per_iteration": 2.5195319652557373 + }, + { + "auxiliary_loss_clip": 0.01147128, + "auxiliary_loss_mlp": 0.01042173, + "balance_loss_clip": 1.05072176, + "balance_loss_mlp": 1.0315274, + "epoch": 0.16124571634702098, + "flos": 22419570741120.0, + "grad_norm": 1.8950014598778249, + "language_loss": 0.76562715, + "learning_rate": 3.822135983157873e-06, + "loss": 0.78752011, + "num_input_tokens_seen": 28251715, + "step": 1341, + "time_per_iteration": 2.598649501800537 + }, + { + "auxiliary_loss_clip": 0.01224813, + "auxiliary_loss_mlp": 0.00765386, + "balance_loss_clip": 1.06451225, + "balance_loss_mlp": 1.00135827, + "epoch": 0.16136595923766006, + "flos": 10999116103680.0, + "grad_norm": 2.945320386367759, + "language_loss": 0.84666026, + "learning_rate": 3.821814709122896e-06, + "loss": 0.86656225, + "num_input_tokens_seen": 28269765, + "step": 1342, + "time_per_iteration": 2.530895709991455 + }, + { + "auxiliary_loss_clip": 0.01196426, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.06143856, + "balance_loss_mlp": 1.02738142, + "epoch": 0.16148620212829917, + "flos": 21214983214080.0, + "grad_norm": 2.1281212295278324, + "language_loss": 0.84891212, + "learning_rate": 3.821493158720076e-06, + "loss": 0.87124407, + "num_input_tokens_seen": 28288870, + "step": 1343, + "time_per_iteration": 2.579946517944336 + }, + { + "auxiliary_loss_clip": 0.01181437, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.05649877, + "balance_loss_mlp": 1.0229609, + "epoch": 0.16160644501893826, + "flos": 16758468760320.0, + "grad_norm": 2.8835141669314845, + "language_loss": 0.73465884, + "learning_rate": 3.821171331998191e-06, + "loss": 0.75681055, + "num_input_tokens_seen": 28305400, + "step": 1344, + "time_per_iteration": 2.512822389602661 + }, + { + "auxiliary_loss_clip": 0.01112276, + "auxiliary_loss_mlp": 0.01009186, + "balance_loss_clip": 1.04560614, + "balance_loss_mlp": 1.0065279, + "epoch": 0.16172668790957734, + "flos": 64444967308800.0, + "grad_norm": 0.9347021770854987, + "language_loss": 0.5448283, + "learning_rate": 3.820849229006064e-06, + "loss": 0.5660429, + "num_input_tokens_seen": 28373150, + "step": 1345, + "time_per_iteration": 3.2853260040283203 + }, + { + "auxiliary_loss_clip": 0.01230609, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.0661428, + "balance_loss_mlp": 1.0237323, + "epoch": 0.16184693080021645, + "flos": 23257689759360.0, + "grad_norm": 2.5152504911007587, + "language_loss": 0.70680386, + "learning_rate": 3.8205268497925564e-06, + "loss": 0.72944701, + "num_input_tokens_seen": 28393620, + "step": 1346, + "time_per_iteration": 2.4712090492248535 + }, + { + "auxiliary_loss_clip": 0.01229389, + "auxiliary_loss_mlp": 0.01038249, + "balance_loss_clip": 1.06678486, + "balance_loss_mlp": 1.02845621, + "epoch": 0.16196717369085553, + "flos": 17451113696640.0, + "grad_norm": 2.235659299754506, + "language_loss": 0.78539979, + "learning_rate": 3.8202041944065725e-06, + "loss": 0.8080762, + "num_input_tokens_seen": 28409440, + "step": 1347, + "time_per_iteration": 2.425034761428833 + }, + { + "auxiliary_loss_clip": 0.01230157, + "auxiliary_loss_mlp": 0.01036703, + "balance_loss_clip": 1.06876183, + "balance_loss_mlp": 1.02648044, + "epoch": 0.16208741658149461, + "flos": 23873377806720.0, + "grad_norm": 1.7500583286342155, + "language_loss": 0.73807669, + "learning_rate": 3.819881262897061e-06, + "loss": 0.76074529, + "num_input_tokens_seen": 28427575, + "step": 1348, + "time_per_iteration": 2.4973952770233154 + }, + { + "auxiliary_loss_clip": 0.01186739, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.06578946, + "balance_loss_mlp": 1.02391839, + "epoch": 0.1622076594721337, + "flos": 25884806584320.0, + "grad_norm": 1.884018408940885, + "language_loss": 0.73562455, + "learning_rate": 3.819558055313008e-06, + "loss": 0.7578395, + "num_input_tokens_seen": 28448260, + "step": 1349, + "time_per_iteration": 2.6526899337768555 + }, + { + "auxiliary_loss_clip": 0.01218988, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.06480014, + "balance_loss_mlp": 1.03117836, + "epoch": 0.1623279023627728, + "flos": 21539759011200.0, + "grad_norm": 2.0984279190709736, + "language_loss": 0.77566063, + "learning_rate": 3.819234571703444e-06, + "loss": 0.79826134, + "num_input_tokens_seen": 28467085, + "step": 1350, + "time_per_iteration": 2.4966864585876465 + }, + { + "auxiliary_loss_clip": 0.01204774, + "auxiliary_loss_mlp": 0.01040655, + "balance_loss_clip": 1.06003189, + "balance_loss_mlp": 1.03030181, + "epoch": 0.1624481452534119, + "flos": 22085421494400.0, + "grad_norm": 1.8600360865161258, + "language_loss": 0.85695267, + "learning_rate": 3.8189108121174435e-06, + "loss": 0.87940693, + "num_input_tokens_seen": 28486850, + "step": 1351, + "time_per_iteration": 2.5167431831359863 + }, + { + "auxiliary_loss_clip": 0.01180566, + "auxiliary_loss_mlp": 0.01039758, + "balance_loss_clip": 1.06437624, + "balance_loss_mlp": 1.0297029, + "epoch": 0.16256838814405097, + "flos": 27087490690560.0, + "grad_norm": 1.8317684110019719, + "language_loss": 0.83584058, + "learning_rate": 3.818586776604118e-06, + "loss": 0.85804385, + "num_input_tokens_seen": 28507490, + "step": 1352, + "time_per_iteration": 2.5973947048187256 + }, + { + "auxiliary_loss_clip": 0.01195985, + "auxiliary_loss_mlp": 0.0104071, + "balance_loss_clip": 1.06215477, + "balance_loss_mlp": 1.03058279, + "epoch": 0.16268863103469008, + "flos": 20120354196480.0, + "grad_norm": 1.8695855380481676, + "language_loss": 0.61486769, + "learning_rate": 3.818262465212625e-06, + "loss": 0.63723469, + "num_input_tokens_seen": 28527615, + "step": 1353, + "time_per_iteration": 2.514058828353882 + }, + { + "auxiliary_loss_clip": 0.01205958, + "auxiliary_loss_mlp": 0.01047808, + "balance_loss_clip": 1.06521297, + "balance_loss_mlp": 1.03678751, + "epoch": 0.16280887392532917, + "flos": 18332792933760.0, + "grad_norm": 1.8389471092448109, + "language_loss": 0.77340096, + "learning_rate": 3.817937877992161e-06, + "loss": 0.79593861, + "num_input_tokens_seen": 28544910, + "step": 1354, + "time_per_iteration": 3.2517943382263184 + }, + { + "auxiliary_loss_clip": 0.01183704, + "auxiliary_loss_mlp": 0.00766408, + "balance_loss_clip": 1.05734324, + "balance_loss_mlp": 1.0012275, + "epoch": 0.16292911681596825, + "flos": 11874330892800.0, + "grad_norm": 4.189629685814826, + "language_loss": 0.85334384, + "learning_rate": 3.817613014991967e-06, + "loss": 0.87284499, + "num_input_tokens_seen": 28561050, + "step": 1355, + "time_per_iteration": 2.5084424018859863 + }, + { + "auxiliary_loss_clip": 0.01175051, + "auxiliary_loss_mlp": 0.01035481, + "balance_loss_clip": 1.05732155, + "balance_loss_mlp": 1.02522874, + "epoch": 0.16304935970660733, + "flos": 26103466627200.0, + "grad_norm": 2.079017450339228, + "language_loss": 0.76704466, + "learning_rate": 3.817287876261323e-06, + "loss": 0.78915, + "num_input_tokens_seen": 28581385, + "step": 1356, + "time_per_iteration": 2.577904462814331 + }, + { + "auxiliary_loss_clip": 0.01194618, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.06390715, + "balance_loss_mlp": 1.02274537, + "epoch": 0.16316960259724644, + "flos": 29351945848320.0, + "grad_norm": 1.9203192633913142, + "language_loss": 0.80038011, + "learning_rate": 3.816962461849553e-06, + "loss": 0.82266378, + "num_input_tokens_seen": 28603255, + "step": 1357, + "time_per_iteration": 2.577784538269043 + }, + { + "auxiliary_loss_clip": 0.01193545, + "auxiliary_loss_mlp": 0.01038386, + "balance_loss_clip": 1.06450117, + "balance_loss_mlp": 1.02784228, + "epoch": 0.16328984548788553, + "flos": 20886759711360.0, + "grad_norm": 1.9062865964148028, + "language_loss": 0.84846306, + "learning_rate": 3.8166367718060235e-06, + "loss": 0.87078232, + "num_input_tokens_seen": 28623145, + "step": 1358, + "time_per_iteration": 2.559354305267334 + }, + { + "auxiliary_loss_clip": 0.01209237, + "auxiliary_loss_mlp": 0.01034096, + "balance_loss_clip": 1.06168008, + "balance_loss_mlp": 1.02407002, + "epoch": 0.1634100883785246, + "flos": 18041090584320.0, + "grad_norm": 2.463957080596287, + "language_loss": 0.76440525, + "learning_rate": 3.816310806180139e-06, + "loss": 0.78683859, + "num_input_tokens_seen": 28641555, + "step": 1359, + "time_per_iteration": 2.459226131439209 + }, + { + "auxiliary_loss_clip": 0.01192513, + "auxiliary_loss_mlp": 0.01042625, + "balance_loss_clip": 1.0613699, + "balance_loss_mlp": 1.03256989, + "epoch": 0.16353033126916372, + "flos": 24572128055040.0, + "grad_norm": 1.6189070321330312, + "language_loss": 0.8088994, + "learning_rate": 3.81598456502135e-06, + "loss": 0.83125079, + "num_input_tokens_seen": 28661575, + "step": 1360, + "time_per_iteration": 3.468073844909668 + }, + { + "auxiliary_loss_clip": 0.01196332, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.0652957, + "balance_loss_mlp": 1.0299809, + "epoch": 0.1636505741598028, + "flos": 19892895321600.0, + "grad_norm": 2.000039107262487, + "language_loss": 0.87098432, + "learning_rate": 3.8156580483791455e-06, + "loss": 0.89335293, + "num_input_tokens_seen": 28676765, + "step": 1361, + "time_per_iteration": 3.2954471111297607 + }, + { + "auxiliary_loss_clip": 0.01230967, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.0666666, + "balance_loss_mlp": 1.025653, + "epoch": 0.16377081705044189, + "flos": 28402611344640.0, + "grad_norm": 2.5208244661310597, + "language_loss": 0.7678265, + "learning_rate": 3.815331256303059e-06, + "loss": 0.79049248, + "num_input_tokens_seen": 28696795, + "step": 1362, + "time_per_iteration": 2.5115602016448975 + }, + { + "auxiliary_loss_clip": 0.01180133, + "auxiliary_loss_mlp": 0.01035799, + "balance_loss_clip": 1.06296825, + "balance_loss_mlp": 1.02529657, + "epoch": 0.163891059941081, + "flos": 21908059113600.0, + "grad_norm": 2.1770114400105514, + "language_loss": 0.77557611, + "learning_rate": 3.815004188842665e-06, + "loss": 0.79773539, + "num_input_tokens_seen": 28714835, + "step": 1363, + "time_per_iteration": 2.5333497524261475 + }, + { + "auxiliary_loss_clip": 0.01193249, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.05873311, + "balance_loss_mlp": 1.02604723, + "epoch": 0.16401130283172008, + "flos": 26797619934720.0, + "grad_norm": 1.6382319331718556, + "language_loss": 0.80046415, + "learning_rate": 3.814676846047578e-06, + "loss": 0.82276446, + "num_input_tokens_seen": 28735710, + "step": 1364, + "time_per_iteration": 2.56845760345459 + }, + { + "auxiliary_loss_clip": 0.01210519, + "auxiliary_loss_mlp": 0.01043548, + "balance_loss_clip": 1.06334269, + "balance_loss_mlp": 1.03334308, + "epoch": 0.16413154572235916, + "flos": 32997417160320.0, + "grad_norm": 1.6800424987292948, + "language_loss": 0.70344496, + "learning_rate": 3.8143492279674565e-06, + "loss": 0.72598565, + "num_input_tokens_seen": 28758405, + "step": 1365, + "time_per_iteration": 2.58172607421875 + }, + { + "auxiliary_loss_clip": 0.01109381, + "auxiliary_loss_mlp": 0.01006006, + "balance_loss_clip": 1.04409742, + "balance_loss_mlp": 1.0031327, + "epoch": 0.16425178861299825, + "flos": 40113622074240.0, + "grad_norm": 0.8419780044993703, + "language_loss": 0.58476913, + "learning_rate": 3.8140213346519997e-06, + "loss": 0.60592306, + "num_input_tokens_seen": 28809000, + "step": 1366, + "time_per_iteration": 2.869281530380249 + }, + { + "auxiliary_loss_clip": 0.01167376, + "auxiliary_loss_mlp": 0.01036153, + "balance_loss_clip": 1.05608976, + "balance_loss_mlp": 1.02591872, + "epoch": 0.16437203150363736, + "flos": 25447486498560.0, + "grad_norm": 1.733083969531301, + "language_loss": 0.77403069, + "learning_rate": 3.813693166150948e-06, + "loss": 0.79606593, + "num_input_tokens_seen": 28829210, + "step": 1367, + "time_per_iteration": 2.5883116722106934 + }, + { + "auxiliary_loss_clip": 0.01174026, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.0583725, + "balance_loss_mlp": 1.02519977, + "epoch": 0.16449227439427644, + "flos": 23476888506240.0, + "grad_norm": 2.2346238575865276, + "language_loss": 0.85482037, + "learning_rate": 3.813364722514086e-06, + "loss": 0.87691832, + "num_input_tokens_seen": 28847545, + "step": 1368, + "time_per_iteration": 2.583721160888672 + }, + { + "auxiliary_loss_clip": 0.01210375, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.06160176, + "balance_loss_mlp": 1.02639985, + "epoch": 0.16461251728491552, + "flos": 13545217802880.0, + "grad_norm": 2.0230408962172817, + "language_loss": 0.80816555, + "learning_rate": 3.8130360037912368e-06, + "loss": 0.83063364, + "num_input_tokens_seen": 28863990, + "step": 1369, + "time_per_iteration": 2.4576709270477295 + }, + { + "auxiliary_loss_clip": 0.01210056, + "auxiliary_loss_mlp": 0.01041436, + "balance_loss_clip": 1.06145298, + "balance_loss_mlp": 1.03005731, + "epoch": 0.16473276017555463, + "flos": 23003298662400.0, + "grad_norm": 1.9511306278999785, + "language_loss": 0.81501842, + "learning_rate": 3.812707010032268e-06, + "loss": 0.83753335, + "num_input_tokens_seen": 28883045, + "step": 1370, + "time_per_iteration": 2.518853187561035 + }, + { + "auxiliary_loss_clip": 0.01219745, + "auxiliary_loss_mlp": 0.01040782, + "balance_loss_clip": 1.06846142, + "balance_loss_mlp": 1.03033924, + "epoch": 0.16485300306619372, + "flos": 24790680357120.0, + "grad_norm": 1.7125036783675724, + "language_loss": 0.79440933, + "learning_rate": 3.8123777412870863e-06, + "loss": 0.81701458, + "num_input_tokens_seen": 28902545, + "step": 1371, + "time_per_iteration": 2.55684232711792 + }, + { + "auxiliary_loss_clip": 0.01203217, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_clip": 1.06136775, + "balance_loss_mlp": 1.03152823, + "epoch": 0.1649732459568328, + "flos": 21106497162240.0, + "grad_norm": 2.1325425615849722, + "language_loss": 0.78685129, + "learning_rate": 3.812048197605643e-06, + "loss": 0.80929935, + "num_input_tokens_seen": 28921440, + "step": 1372, + "time_per_iteration": 2.525838613510132 + }, + { + "auxiliary_loss_clip": 0.01212257, + "auxiliary_loss_mlp": 0.01029226, + "balance_loss_clip": 1.06244302, + "balance_loss_mlp": 1.01880085, + "epoch": 0.16509348884747188, + "flos": 20266726118400.0, + "grad_norm": 1.8115963360665694, + "language_loss": 0.81618851, + "learning_rate": 3.8117183790379277e-06, + "loss": 0.83860332, + "num_input_tokens_seen": 28939890, + "step": 1373, + "time_per_iteration": 2.5070483684539795 + }, + { + "auxiliary_loss_clip": 0.0122795, + "auxiliary_loss_mlp": 0.01034944, + "balance_loss_clip": 1.06444144, + "balance_loss_mlp": 1.02419734, + "epoch": 0.165213731738111, + "flos": 11035493602560.0, + "grad_norm": 2.714810433806094, + "language_loss": 0.93467635, + "learning_rate": 3.811388285633976e-06, + "loss": 0.95730531, + "num_input_tokens_seen": 28955875, + "step": 1374, + "time_per_iteration": 2.4512100219726562 + }, + { + "auxiliary_loss_clip": 0.01171078, + "auxiliary_loss_mlp": 0.01046444, + "balance_loss_clip": 1.0563519, + "balance_loss_mlp": 1.03607297, + "epoch": 0.16533397462875007, + "flos": 29972051268480.0, + "grad_norm": 1.8133684746255796, + "language_loss": 0.61846185, + "learning_rate": 3.811057917443861e-06, + "loss": 0.6406371, + "num_input_tokens_seen": 28975140, + "step": 1375, + "time_per_iteration": 2.6408092975616455 + }, + { + "auxiliary_loss_clip": 0.01125922, + "auxiliary_loss_mlp": 0.0100883, + "balance_loss_clip": 1.04679835, + "balance_loss_mlp": 1.00600457, + "epoch": 0.16545421751938916, + "flos": 65556763027200.0, + "grad_norm": 0.8520100086531267, + "language_loss": 0.68370509, + "learning_rate": 3.8107272745177e-06, + "loss": 0.70505261, + "num_input_tokens_seen": 29047470, + "step": 1376, + "time_per_iteration": 3.237907648086548 + }, + { + "auxiliary_loss_clip": 0.01183851, + "auxiliary_loss_mlp": 0.01037385, + "balance_loss_clip": 1.0614059, + "balance_loss_mlp": 1.02759159, + "epoch": 0.16557446041002827, + "flos": 22492361652480.0, + "grad_norm": 1.81254315151487, + "language_loss": 0.78804159, + "learning_rate": 3.8103963569056513e-06, + "loss": 0.81025398, + "num_input_tokens_seen": 29066605, + "step": 1377, + "time_per_iteration": 2.5636985301971436 + }, + { + "auxiliary_loss_clip": 0.01190261, + "auxiliary_loss_mlp": 0.01039603, + "balance_loss_clip": 1.05754375, + "balance_loss_mlp": 1.02918434, + "epoch": 0.16569470330066735, + "flos": 24602723464320.0, + "grad_norm": 1.8081945780223116, + "language_loss": 0.88220656, + "learning_rate": 3.8100651646579146e-06, + "loss": 0.90450519, + "num_input_tokens_seen": 29085815, + "step": 1378, + "time_per_iteration": 2.5551843643188477 + }, + { + "auxiliary_loss_clip": 0.0119106, + "auxiliary_loss_mlp": 0.01040802, + "balance_loss_clip": 1.0567534, + "balance_loss_mlp": 1.03037679, + "epoch": 0.16581494619130643, + "flos": 15006207588480.0, + "grad_norm": 2.2579932200894492, + "language_loss": 0.92520642, + "learning_rate": 3.8097336978247317e-06, + "loss": 0.94752502, + "num_input_tokens_seen": 29102520, + "step": 1379, + "time_per_iteration": 2.5022709369659424 + }, + { + "auxiliary_loss_clip": 0.01180913, + "auxiliary_loss_mlp": 0.01033551, + "balance_loss_clip": 1.05739605, + "balance_loss_mlp": 1.02250028, + "epoch": 0.16593518908194552, + "flos": 17420338719360.0, + "grad_norm": 8.971629474181267, + "language_loss": 0.89236295, + "learning_rate": 3.8094019564563854e-06, + "loss": 0.91450757, + "num_input_tokens_seen": 29119450, + "step": 1380, + "time_per_iteration": 2.4834752082824707 + }, + { + "auxiliary_loss_clip": 0.01224668, + "auxiliary_loss_mlp": 0.00765699, + "balance_loss_clip": 1.06271076, + "balance_loss_mlp": 1.00107872, + "epoch": 0.16605543197258463, + "flos": 20412631163520.0, + "grad_norm": 2.052760973060971, + "language_loss": 0.75581467, + "learning_rate": 3.809069940603201e-06, + "loss": 0.77571833, + "num_input_tokens_seen": 29137405, + "step": 1381, + "time_per_iteration": 3.274097442626953 + }, + { + "auxiliary_loss_clip": 0.01183993, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.05729604, + "balance_loss_mlp": 1.02640867, + "epoch": 0.1661756748632237, + "flos": 14209745368320.0, + "grad_norm": 2.0132959896732205, + "language_loss": 0.78133857, + "learning_rate": 3.8087376503155452e-06, + "loss": 0.80354583, + "num_input_tokens_seen": 29154890, + "step": 1382, + "time_per_iteration": 2.5003859996795654 + }, + { + "auxiliary_loss_clip": 0.01113596, + "auxiliary_loss_mlp": 0.01005909, + "balance_loss_clip": 1.0397017, + "balance_loss_mlp": 1.00326288, + "epoch": 0.1662959177538628, + "flos": 66080877350400.0, + "grad_norm": 0.8994570687113425, + "language_loss": 0.56258345, + "learning_rate": 3.808405085643826e-06, + "loss": 0.5837785, + "num_input_tokens_seen": 29219770, + "step": 1383, + "time_per_iteration": 3.1710710525512695 + }, + { + "auxiliary_loss_clip": 0.01229323, + "auxiliary_loss_mlp": 0.00764991, + "balance_loss_clip": 1.06563878, + "balance_loss_mlp": 1.00100219, + "epoch": 0.1664161606445019, + "flos": 20740567357440.0, + "grad_norm": 1.982860553189857, + "language_loss": 0.89035559, + "learning_rate": 3.8080722466384925e-06, + "loss": 0.91029871, + "num_input_tokens_seen": 29237620, + "step": 1384, + "time_per_iteration": 2.564457654953003 + }, + { + "auxiliary_loss_clip": 0.0122652, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.06080079, + "balance_loss_mlp": 1.02411914, + "epoch": 0.166536403535141, + "flos": 25260930236160.0, + "grad_norm": 2.4412553749917314, + "language_loss": 0.70852494, + "learning_rate": 3.8077391333500376e-06, + "loss": 0.73114431, + "num_input_tokens_seen": 29256760, + "step": 1385, + "time_per_iteration": 2.5124237537384033 + }, + { + "auxiliary_loss_clip": 0.01197055, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.06316423, + "balance_loss_mlp": 1.02525413, + "epoch": 0.16665664642578007, + "flos": 25447450584960.0, + "grad_norm": 1.6219565142174277, + "language_loss": 0.76914406, + "learning_rate": 3.8074057458289934e-06, + "loss": 0.79146153, + "num_input_tokens_seen": 29277450, + "step": 1386, + "time_per_iteration": 2.571053981781006 + }, + { + "auxiliary_loss_clip": 0.01197183, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.05859804, + "balance_loss_mlp": 1.022717, + "epoch": 0.16677688931641918, + "flos": 22200767043840.0, + "grad_norm": 2.1283436875111574, + "language_loss": 0.82579291, + "learning_rate": 3.807072084125934e-06, + "loss": 0.84809327, + "num_input_tokens_seen": 29299300, + "step": 1387, + "time_per_iteration": 4.2407801151275635 + }, + { + "auxiliary_loss_clip": 0.0119133, + "auxiliary_loss_mlp": 0.01036626, + "balance_loss_clip": 1.06075597, + "balance_loss_mlp": 1.02601099, + "epoch": 0.16689713220705826, + "flos": 16945958776320.0, + "grad_norm": 2.206606544761664, + "language_loss": 0.80379069, + "learning_rate": 3.806738148291477e-06, + "loss": 0.82607031, + "num_input_tokens_seen": 29316125, + "step": 1388, + "time_per_iteration": 3.274813413619995 + }, + { + "auxiliary_loss_clip": 0.01153644, + "auxiliary_loss_mlp": 0.01036511, + "balance_loss_clip": 1.05329514, + "balance_loss_mlp": 1.02517402, + "epoch": 0.16701737509769735, + "flos": 36244423923840.0, + "grad_norm": 2.0105182950254163, + "language_loss": 0.7117635, + "learning_rate": 3.8064039383762793e-06, + "loss": 0.73366505, + "num_input_tokens_seen": 29338490, + "step": 1389, + "time_per_iteration": 2.7404568195343018 + }, + { + "auxiliary_loss_clip": 0.01211217, + "auxiliary_loss_mlp": 0.01035724, + "balance_loss_clip": 1.0650568, + "balance_loss_mlp": 1.02581751, + "epoch": 0.16713761798833643, + "flos": 23258659426560.0, + "grad_norm": 2.0744782597917077, + "language_loss": 0.77264869, + "learning_rate": 3.8060694544310396e-06, + "loss": 0.79511809, + "num_input_tokens_seen": 29357000, + "step": 1390, + "time_per_iteration": 2.5060887336730957 + }, + { + "auxiliary_loss_clip": 0.01227837, + "auxiliary_loss_mlp": 0.01046129, + "balance_loss_clip": 1.0635972, + "balance_loss_mlp": 1.03505468, + "epoch": 0.16725786087897554, + "flos": 25302515207040.0, + "grad_norm": 2.8604823169462423, + "language_loss": 0.78473896, + "learning_rate": 3.8057346965065006e-06, + "loss": 0.80747861, + "num_input_tokens_seen": 29378230, + "step": 1391, + "time_per_iteration": 2.5625696182250977 + }, + { + "auxiliary_loss_clip": 0.01194707, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.06337333, + "balance_loss_mlp": 1.02995348, + "epoch": 0.16737810376961462, + "flos": 31831541516160.0, + "grad_norm": 1.7428483930149752, + "language_loss": 0.84235871, + "learning_rate": 3.805399664653443e-06, + "loss": 0.86470526, + "num_input_tokens_seen": 29400370, + "step": 1392, + "time_per_iteration": 2.5940401554107666 + }, + { + "auxiliary_loss_clip": 0.01228854, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.06441784, + "balance_loss_mlp": 1.02242899, + "epoch": 0.1674983466602537, + "flos": 27961843553280.0, + "grad_norm": 2.5359832136223766, + "language_loss": 0.73990822, + "learning_rate": 3.805064358922692e-06, + "loss": 0.76252645, + "num_input_tokens_seen": 29418660, + "step": 1393, + "time_per_iteration": 2.4924235343933105 + }, + { + "auxiliary_loss_clip": 0.01215762, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.0626502, + "balance_loss_mlp": 1.02334189, + "epoch": 0.16761858955089282, + "flos": 21762656858880.0, + "grad_norm": 1.6605558146107995, + "language_loss": 0.80892026, + "learning_rate": 3.8047287793651136e-06, + "loss": 0.83142078, + "num_input_tokens_seen": 29440105, + "step": 1394, + "time_per_iteration": 2.5098814964294434 + }, + { + "auxiliary_loss_clip": 0.01182484, + "auxiliary_loss_mlp": 0.01040966, + "balance_loss_clip": 1.05948305, + "balance_loss_mlp": 1.03104222, + "epoch": 0.1677388324415319, + "flos": 23805507058560.0, + "grad_norm": 1.9090367690346433, + "language_loss": 0.89084631, + "learning_rate": 3.8043929260316137e-06, + "loss": 0.91308081, + "num_input_tokens_seen": 29458260, + "step": 1395, + "time_per_iteration": 2.5944623947143555 + }, + { + "auxiliary_loss_clip": 0.01200802, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.06691086, + "balance_loss_mlp": 1.02762794, + "epoch": 0.16785907533217098, + "flos": 20558859431040.0, + "grad_norm": 2.6490667294790984, + "language_loss": 0.83648103, + "learning_rate": 3.8040567989731417e-06, + "loss": 0.85887212, + "num_input_tokens_seen": 29476205, + "step": 1396, + "time_per_iteration": 2.5344557762145996 + }, + { + "auxiliary_loss_clip": 0.01206737, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.06304944, + "balance_loss_mlp": 1.02339196, + "epoch": 0.16797931822281006, + "flos": 15669657745920.0, + "grad_norm": 2.0475327141272888, + "language_loss": 0.79928887, + "learning_rate": 3.8037203982406876e-06, + "loss": 0.82168621, + "num_input_tokens_seen": 29494370, + "step": 1397, + "time_per_iteration": 2.488276481628418 + }, + { + "auxiliary_loss_clip": 0.01226939, + "auxiliary_loss_mlp": 0.01035158, + "balance_loss_clip": 1.06578791, + "balance_loss_mlp": 1.02456009, + "epoch": 0.16809956111344918, + "flos": 16541101607040.0, + "grad_norm": 1.8410107359287795, + "language_loss": 0.73271787, + "learning_rate": 3.8033837238852835e-06, + "loss": 0.75533879, + "num_input_tokens_seen": 29511070, + "step": 1398, + "time_per_iteration": 2.436167001724243 + }, + { + "auxiliary_loss_clip": 0.01186933, + "auxiliary_loss_mlp": 0.01036228, + "balance_loss_clip": 1.05714464, + "balance_loss_mlp": 1.02669716, + "epoch": 0.16821980400408826, + "flos": 23258084808960.0, + "grad_norm": 1.7759487014664332, + "language_loss": 0.69822156, + "learning_rate": 3.8030467759580017e-06, + "loss": 0.72045314, + "num_input_tokens_seen": 29531990, + "step": 1399, + "time_per_iteration": 2.5321736335754395 + }, + { + "auxiliary_loss_clip": 0.01215549, + "auxiliary_loss_mlp": 0.01040832, + "balance_loss_clip": 1.06260562, + "balance_loss_mlp": 1.03009772, + "epoch": 0.16834004689472734, + "flos": 20774754126720.0, + "grad_norm": 1.8760981690256469, + "language_loss": 0.87315619, + "learning_rate": 3.802709554509958e-06, + "loss": 0.89572001, + "num_input_tokens_seen": 29549790, + "step": 1400, + "time_per_iteration": 2.4656083583831787 + }, + { + "auxiliary_loss_clip": 0.01194745, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.0597322, + "balance_loss_mlp": 1.02144051, + "epoch": 0.16846028978536645, + "flos": 26687302289280.0, + "grad_norm": 2.18740556275661, + "language_loss": 0.79255962, + "learning_rate": 3.8023720595923083e-06, + "loss": 0.81481344, + "num_input_tokens_seen": 29569045, + "step": 1401, + "time_per_iteration": 2.5857348442077637 + }, + { + "auxiliary_loss_clip": 0.01161763, + "auxiliary_loss_mlp": 0.01036317, + "balance_loss_clip": 1.05569124, + "balance_loss_mlp": 1.0259341, + "epoch": 0.16858053267600553, + "flos": 18843298980480.0, + "grad_norm": 1.984343811345821, + "language_loss": 0.87713242, + "learning_rate": 3.80203429125625e-06, + "loss": 0.89911318, + "num_input_tokens_seen": 29587220, + "step": 1402, + "time_per_iteration": 2.5734236240386963 + }, + { + "auxiliary_loss_clip": 0.01141583, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.05420411, + "balance_loss_mlp": 1.02434897, + "epoch": 0.16870077556664462, + "flos": 27744548227200.0, + "grad_norm": 1.7811897472461624, + "language_loss": 0.70326531, + "learning_rate": 3.8016962495530225e-06, + "loss": 0.72502381, + "num_input_tokens_seen": 29606410, + "step": 1403, + "time_per_iteration": 2.6648831367492676 + }, + { + "auxiliary_loss_clip": 0.01229687, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.0662477, + "balance_loss_mlp": 1.02994609, + "epoch": 0.1688210184572837, + "flos": 13730768484480.0, + "grad_norm": 2.2473037856654776, + "language_loss": 0.77412438, + "learning_rate": 3.8013579345339063e-06, + "loss": 0.79681587, + "num_input_tokens_seen": 29621275, + "step": 1404, + "time_per_iteration": 2.428229570388794 + }, + { + "auxiliary_loss_clip": 0.01185222, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.05883288, + "balance_loss_mlp": 1.02195835, + "epoch": 0.1689412613479228, + "flos": 26468785900800.0, + "grad_norm": 2.128282175240056, + "language_loss": 0.69736111, + "learning_rate": 3.801019346250224e-06, + "loss": 0.7195363, + "num_input_tokens_seen": 29641420, + "step": 1405, + "time_per_iteration": 2.6064810752868652 + }, + { + "auxiliary_loss_clip": 0.01208163, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.0626018, + "balance_loss_mlp": 1.02249122, + "epoch": 0.1690615042385619, + "flos": 21138852337920.0, + "grad_norm": 4.432058064474053, + "language_loss": 0.84003842, + "learning_rate": 3.8006804847533395e-06, + "loss": 0.86244857, + "num_input_tokens_seen": 29660935, + "step": 1406, + "time_per_iteration": 2.5351383686065674 + }, + { + "auxiliary_loss_clip": 0.01230538, + "auxiliary_loss_mlp": 0.01039557, + "balance_loss_clip": 1.06615186, + "balance_loss_mlp": 1.03051496, + "epoch": 0.16918174712920098, + "flos": 20849340718080.0, + "grad_norm": 2.1460214244826257, + "language_loss": 0.8576861, + "learning_rate": 3.8003413500946556e-06, + "loss": 0.88038707, + "num_input_tokens_seen": 29681045, + "step": 1407, + "time_per_iteration": 3.298356056213379 + }, + { + "auxiliary_loss_clip": 0.01198509, + "auxiliary_loss_mlp": 0.01041213, + "balance_loss_clip": 1.06286693, + "balance_loss_mlp": 1.03038335, + "epoch": 0.1693019900198401, + "flos": 16983270028800.0, + "grad_norm": 2.373936961580943, + "language_loss": 0.82466704, + "learning_rate": 3.8000019423256216e-06, + "loss": 0.84706426, + "num_input_tokens_seen": 29698810, + "step": 1408, + "time_per_iteration": 2.5213544368743896 + }, + { + "auxiliary_loss_clip": 0.01187343, + "auxiliary_loss_mlp": 0.01045447, + "balance_loss_clip": 1.06205058, + "balance_loss_mlp": 1.03530288, + "epoch": 0.16942223291047917, + "flos": 26796901662720.0, + "grad_norm": 1.5668950939483715, + "language_loss": 0.87990034, + "learning_rate": 3.7996622614977234e-06, + "loss": 0.90222824, + "num_input_tokens_seen": 29720000, + "step": 1409, + "time_per_iteration": 2.570082664489746 + }, + { + "auxiliary_loss_clip": 0.01194845, + "auxiliary_loss_mlp": 0.01035001, + "balance_loss_clip": 1.06289458, + "balance_loss_mlp": 1.02547002, + "epoch": 0.16954247580111825, + "flos": 18583700411520.0, + "grad_norm": 1.7840986055052674, + "language_loss": 0.79178345, + "learning_rate": 3.799322307662492e-06, + "loss": 0.81408191, + "num_input_tokens_seen": 29737820, + "step": 1410, + "time_per_iteration": 2.5340983867645264 + }, + { + "auxiliary_loss_clip": 0.01170895, + "auxiliary_loss_mlp": 0.01031126, + "balance_loss_clip": 1.05707288, + "balance_loss_mlp": 1.02070141, + "epoch": 0.16966271869175734, + "flos": 13983651210240.0, + "grad_norm": 2.1602495070158882, + "language_loss": 0.83495837, + "learning_rate": 3.798982080871496e-06, + "loss": 0.8569786, + "num_input_tokens_seen": 29752960, + "step": 1411, + "time_per_iteration": 2.6597843170166016 + }, + { + "auxiliary_loss_clip": 0.01230922, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.06725669, + "balance_loss_mlp": 1.0287838, + "epoch": 0.16978296158239645, + "flos": 37487328284160.0, + "grad_norm": 2.186501420728542, + "language_loss": 0.67638087, + "learning_rate": 3.798641581176349e-06, + "loss": 0.69908476, + "num_input_tokens_seen": 29775240, + "step": 1412, + "time_per_iteration": 2.607224702835083 + }, + { + "auxiliary_loss_clip": 0.0119717, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.06025338, + "balance_loss_mlp": 1.03231621, + "epoch": 0.16990320447303553, + "flos": 28328958506880.0, + "grad_norm": 1.8804893321233875, + "language_loss": 0.74920118, + "learning_rate": 3.7983008086287044e-06, + "loss": 0.77160436, + "num_input_tokens_seen": 29796560, + "step": 1413, + "time_per_iteration": 2.6074204444885254 + }, + { + "auxiliary_loss_clip": 0.01192482, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.05804729, + "balance_loss_mlp": 1.02831626, + "epoch": 0.1700234473636746, + "flos": 20188189031040.0, + "grad_norm": 3.1100469396977504, + "language_loss": 0.79611427, + "learning_rate": 3.797959763280257e-06, + "loss": 0.81843024, + "num_input_tokens_seen": 29815245, + "step": 1414, + "time_per_iteration": 4.217747211456299 + }, + { + "auxiliary_loss_clip": 0.01216257, + "auxiliary_loss_mlp": 0.01046714, + "balance_loss_clip": 1.06441629, + "balance_loss_mlp": 1.03708172, + "epoch": 0.17014369025431372, + "flos": 24858658846080.0, + "grad_norm": 1.8866636825495138, + "language_loss": 0.79411358, + "learning_rate": 3.797618445182743e-06, + "loss": 0.81674325, + "num_input_tokens_seen": 29836640, + "step": 1415, + "time_per_iteration": 3.3129775524139404 + }, + { + "auxiliary_loss_clip": 0.01162699, + "auxiliary_loss_mlp": 0.01035782, + "balance_loss_clip": 1.05588186, + "balance_loss_mlp": 1.02528572, + "epoch": 0.1702639331449528, + "flos": 16467233287680.0, + "grad_norm": 1.943683991808472, + "language_loss": 0.84756851, + "learning_rate": 3.79727685438794e-06, + "loss": 0.86955333, + "num_input_tokens_seen": 29850830, + "step": 1416, + "time_per_iteration": 2.5465750694274902 + }, + { + "auxiliary_loss_clip": 0.01134974, + "auxiliary_loss_mlp": 0.01003247, + "balance_loss_clip": 1.04604411, + "balance_loss_mlp": 1.00050533, + "epoch": 0.1703841760355919, + "flos": 52508870979840.0, + "grad_norm": 0.8388428360397058, + "language_loss": 0.61706597, + "learning_rate": 3.796934990947667e-06, + "loss": 0.63844818, + "num_input_tokens_seen": 29912515, + "step": 1417, + "time_per_iteration": 3.1363086700439453 + }, + { + "auxiliary_loss_clip": 0.01133715, + "auxiliary_loss_mlp": 0.0100463, + "balance_loss_clip": 1.04555082, + "balance_loss_mlp": 1.00185287, + "epoch": 0.170504418926231, + "flos": 49370637576960.0, + "grad_norm": 0.878300632858872, + "language_loss": 0.62479937, + "learning_rate": 3.7965928549137854e-06, + "loss": 0.64618289, + "num_input_tokens_seen": 29969330, + "step": 1418, + "time_per_iteration": 2.9846107959747314 + }, + { + "auxiliary_loss_clip": 0.01185632, + "auxiliary_loss_mlp": 0.01040592, + "balance_loss_clip": 1.0556097, + "balance_loss_mlp": 1.02974439, + "epoch": 0.17062466181687008, + "flos": 25849219184640.0, + "grad_norm": 2.0942533726877905, + "language_loss": 0.77478254, + "learning_rate": 3.7962504463381953e-06, + "loss": 0.79704475, + "num_input_tokens_seen": 29990820, + "step": 1419, + "time_per_iteration": 2.6042940616607666 + }, + { + "auxiliary_loss_clip": 0.0119339, + "auxiliary_loss_mlp": 0.00766508, + "balance_loss_clip": 1.06383729, + "balance_loss_mlp": 1.00090909, + "epoch": 0.17074490470750917, + "flos": 20960412549120.0, + "grad_norm": 1.7503120276866715, + "language_loss": 0.7898913, + "learning_rate": 3.7959077652728412e-06, + "loss": 0.80949026, + "num_input_tokens_seen": 30009275, + "step": 1420, + "time_per_iteration": 2.5444934368133545 + }, + { + "auxiliary_loss_clip": 0.01196036, + "auxiliary_loss_mlp": 0.01039795, + "balance_loss_clip": 1.05955565, + "balance_loss_mlp": 1.02935195, + "epoch": 0.17086514759814825, + "flos": 20959766104320.0, + "grad_norm": 2.6277076389224443, + "language_loss": 0.77596462, + "learning_rate": 3.795564811769707e-06, + "loss": 0.79832292, + "num_input_tokens_seen": 30027630, + "step": 1421, + "time_per_iteration": 2.5280802249908447 + }, + { + "auxiliary_loss_clip": 0.0119714, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.06417918, + "balance_loss_mlp": 1.02626896, + "epoch": 0.17098539048878736, + "flos": 28474073452800.0, + "grad_norm": 1.9606844455074244, + "language_loss": 0.78223407, + "learning_rate": 3.795221585880818e-06, + "loss": 0.80458289, + "num_input_tokens_seen": 30048310, + "step": 1422, + "time_per_iteration": 2.607741117477417 + }, + { + "auxiliary_loss_clip": 0.01182946, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.06351352, + "balance_loss_mlp": 1.0280118, + "epoch": 0.17110563337942644, + "flos": 16290014561280.0, + "grad_norm": 1.8635863668084915, + "language_loss": 0.91161311, + "learning_rate": 3.794878087658242e-06, + "loss": 0.93381739, + "num_input_tokens_seen": 30066080, + "step": 1423, + "time_per_iteration": 2.5263290405273438 + }, + { + "auxiliary_loss_clip": 0.01212412, + "auxiliary_loss_mlp": 0.01036035, + "balance_loss_clip": 1.06093574, + "balance_loss_mlp": 1.02628374, + "epoch": 0.17122587627006552, + "flos": 29674207693440.0, + "grad_norm": 1.972740840965769, + "language_loss": 0.78389752, + "learning_rate": 3.7945343171540873e-06, + "loss": 0.80638194, + "num_input_tokens_seen": 30086955, + "step": 1424, + "time_per_iteration": 2.557713747024536 + }, + { + "auxiliary_loss_clip": 0.01229543, + "auxiliary_loss_mlp": 0.01038775, + "balance_loss_clip": 1.06529009, + "balance_loss_mlp": 1.02756333, + "epoch": 0.17134611916070464, + "flos": 25338389915520.0, + "grad_norm": 1.7429233529747776, + "language_loss": 0.7850129, + "learning_rate": 3.7941902744205033e-06, + "loss": 0.8076961, + "num_input_tokens_seen": 30107990, + "step": 1425, + "time_per_iteration": 2.517632246017456 + }, + { + "auxiliary_loss_clip": 0.01200463, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.06053579, + "balance_loss_mlp": 1.02386379, + "epoch": 0.17146636205134372, + "flos": 13953845900160.0, + "grad_norm": 1.8725603799353174, + "language_loss": 0.83362359, + "learning_rate": 3.7938459595096817e-06, + "loss": 0.85598022, + "num_input_tokens_seen": 30126535, + "step": 1426, + "time_per_iteration": 2.5453715324401855 + }, + { + "auxiliary_loss_clip": 0.01218655, + "auxiliary_loss_mlp": 0.01037703, + "balance_loss_clip": 1.06246901, + "balance_loss_mlp": 1.02697444, + "epoch": 0.1715866049419828, + "flos": 23915214172800.0, + "grad_norm": 1.749569964461334, + "language_loss": 0.86320484, + "learning_rate": 3.7935013724738545e-06, + "loss": 0.88576841, + "num_input_tokens_seen": 30147035, + "step": 1427, + "time_per_iteration": 2.524198055267334 + }, + { + "auxiliary_loss_clip": 0.01206981, + "auxiliary_loss_mlp": 0.01040626, + "balance_loss_clip": 1.06174147, + "balance_loss_mlp": 1.03084469, + "epoch": 0.17170684783262188, + "flos": 22709369669760.0, + "grad_norm": 1.7177835555974934, + "language_loss": 0.77919137, + "learning_rate": 3.7931565133652945e-06, + "loss": 0.80166739, + "num_input_tokens_seen": 30167110, + "step": 1428, + "time_per_iteration": 2.503652572631836 + }, + { + "auxiliary_loss_clip": 0.01226606, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.06393242, + "balance_loss_mlp": 1.02600861, + "epoch": 0.171827090723261, + "flos": 26613290315520.0, + "grad_norm": 2.3727193250489607, + "language_loss": 0.68187082, + "learning_rate": 3.792811382236317e-06, + "loss": 0.70450157, + "num_input_tokens_seen": 30185620, + "step": 1429, + "time_per_iteration": 2.523554801940918 + }, + { + "auxiliary_loss_clip": 0.01216202, + "auxiliary_loss_mlp": 0.0103349, + "balance_loss_clip": 1.06277788, + "balance_loss_mlp": 1.02280283, + "epoch": 0.17194733361390008, + "flos": 28148507556480.0, + "grad_norm": 2.075552834756765, + "language_loss": 0.78314114, + "learning_rate": 3.792465979139279e-06, + "loss": 0.80563807, + "num_input_tokens_seen": 30208225, + "step": 1430, + "time_per_iteration": 2.573277473449707 + }, + { + "auxiliary_loss_clip": 0.01100972, + "auxiliary_loss_mlp": 0.0102745, + "balance_loss_clip": 1.03785551, + "balance_loss_mlp": 1.02462447, + "epoch": 0.17206757650453916, + "flos": 65530689753600.0, + "grad_norm": 0.9244882717881197, + "language_loss": 0.65645969, + "learning_rate": 3.792120304126576e-06, + "loss": 0.67774385, + "num_input_tokens_seen": 30271600, + "step": 1431, + "time_per_iteration": 3.1704819202423096 + }, + { + "auxiliary_loss_clip": 0.01138726, + "auxiliary_loss_mlp": 0.01027081, + "balance_loss_clip": 1.05465043, + "balance_loss_mlp": 1.01774073, + "epoch": 0.17218781939517827, + "flos": 22273486128000.0, + "grad_norm": 1.878238450222419, + "language_loss": 0.83875877, + "learning_rate": 3.791774357250649e-06, + "loss": 0.86041683, + "num_input_tokens_seen": 30290430, + "step": 1432, + "time_per_iteration": 2.774661064147949 + }, + { + "auxiliary_loss_clip": 0.01192632, + "auxiliary_loss_mlp": 0.01048282, + "balance_loss_clip": 1.05880046, + "balance_loss_mlp": 1.03708231, + "epoch": 0.17230806228581735, + "flos": 14137313592960.0, + "grad_norm": 2.074017535049121, + "language_loss": 0.79137564, + "learning_rate": 3.7914281385639757e-06, + "loss": 0.81378484, + "num_input_tokens_seen": 30308305, + "step": 1433, + "time_per_iteration": 2.5613033771514893 + }, + { + "auxiliary_loss_clip": 0.01209972, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.05939877, + "balance_loss_mlp": 1.02413106, + "epoch": 0.17242830517645644, + "flos": 20704836303360.0, + "grad_norm": 2.020455169326723, + "language_loss": 0.79531109, + "learning_rate": 3.7910816481190784e-06, + "loss": 0.81775796, + "num_input_tokens_seen": 30328120, + "step": 1434, + "time_per_iteration": 3.3162264823913574 + }, + { + "auxiliary_loss_clip": 0.01182037, + "auxiliary_loss_mlp": 0.01037166, + "balance_loss_clip": 1.05602324, + "balance_loss_mlp": 1.02668166, + "epoch": 0.17254854806709552, + "flos": 30774582887040.0, + "grad_norm": 2.0229952467878256, + "language_loss": 0.74950767, + "learning_rate": 3.7907348859685193e-06, + "loss": 0.77169973, + "num_input_tokens_seen": 30349825, + "step": 1435, + "time_per_iteration": 2.602051258087158 + }, + { + "auxiliary_loss_clip": 0.01202795, + "auxiliary_loss_mlp": 0.01034527, + "balance_loss_clip": 1.06140232, + "balance_loss_mlp": 1.02376199, + "epoch": 0.17266879095773463, + "flos": 26614726859520.0, + "grad_norm": 1.92522314709842, + "language_loss": 0.80151463, + "learning_rate": 3.790387852164902e-06, + "loss": 0.82388783, + "num_input_tokens_seen": 30370555, + "step": 1436, + "time_per_iteration": 2.533043384552002 + }, + { + "auxiliary_loss_clip": 0.01210455, + "auxiliary_loss_mlp": 0.01037869, + "balance_loss_clip": 1.06172287, + "balance_loss_mlp": 1.02795076, + "epoch": 0.1727890338483737, + "flos": 20266295155200.0, + "grad_norm": 4.618461891576178, + "language_loss": 0.77113122, + "learning_rate": 3.7900405467608707e-06, + "loss": 0.79361439, + "num_input_tokens_seen": 30390100, + "step": 1437, + "time_per_iteration": 2.4939024448394775 + }, + { + "auxiliary_loss_clip": 0.01149515, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.05029559, + "balance_loss_mlp": 1.02269423, + "epoch": 0.1729092767390128, + "flos": 18179812909440.0, + "grad_norm": 3.2653280561745, + "language_loss": 0.78959757, + "learning_rate": 3.7896929698091114e-06, + "loss": 0.81142581, + "num_input_tokens_seen": 30402915, + "step": 1438, + "time_per_iteration": 2.5417556762695312 + }, + { + "auxiliary_loss_clip": 0.01228201, + "auxiliary_loss_mlp": 0.01042917, + "balance_loss_clip": 1.06677377, + "balance_loss_mlp": 1.03250384, + "epoch": 0.1730295196296519, + "flos": 26759518583040.0, + "grad_norm": 3.5712285430426873, + "language_loss": 0.68447709, + "learning_rate": 3.7893451213623518e-06, + "loss": 0.70718831, + "num_input_tokens_seen": 30420145, + "step": 1439, + "time_per_iteration": 2.4793314933776855 + }, + { + "auxiliary_loss_clip": 0.01210656, + "auxiliary_loss_mlp": 0.00765794, + "balance_loss_clip": 1.06375563, + "balance_loss_mlp": 1.00084448, + "epoch": 0.173149762520291, + "flos": 23842531002240.0, + "grad_norm": 2.2291654417466815, + "language_loss": 0.82187635, + "learning_rate": 3.7889970014733606e-06, + "loss": 0.84164083, + "num_input_tokens_seen": 30439250, + "step": 1440, + "time_per_iteration": 3.323448896408081 + }, + { + "auxiliary_loss_clip": 0.01146114, + "auxiliary_loss_mlp": 0.01036768, + "balance_loss_clip": 1.05067933, + "balance_loss_mlp": 1.02589035, + "epoch": 0.17327000541093007, + "flos": 23368186972800.0, + "grad_norm": 1.7709030682981897, + "language_loss": 0.78132027, + "learning_rate": 3.7886486101949463e-06, + "loss": 0.8031491, + "num_input_tokens_seen": 30460430, + "step": 1441, + "time_per_iteration": 4.266057252883911 + }, + { + "auxiliary_loss_clip": 0.01154186, + "auxiliary_loss_mlp": 0.0104612, + "balance_loss_clip": 1.05389822, + "balance_loss_mlp": 1.0355165, + "epoch": 0.17339024830156918, + "flos": 18221290139520.0, + "grad_norm": 1.9061362269322848, + "language_loss": 0.88305372, + "learning_rate": 3.7882999475799594e-06, + "loss": 0.90505677, + "num_input_tokens_seen": 30478465, + "step": 1442, + "time_per_iteration": 2.5881457328796387 + }, + { + "auxiliary_loss_clip": 0.01148973, + "auxiliary_loss_mlp": 0.01039488, + "balance_loss_clip": 1.05625772, + "balance_loss_mlp": 1.02933741, + "epoch": 0.17351049119220827, + "flos": 23332024955520.0, + "grad_norm": 1.8436320015938201, + "language_loss": 0.81345373, + "learning_rate": 3.787951013681293e-06, + "loss": 0.83533835, + "num_input_tokens_seen": 30496510, + "step": 1443, + "time_per_iteration": 2.612109422683716 + }, + { + "auxiliary_loss_clip": 0.0120618, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.05773818, + "balance_loss_mlp": 1.03287363, + "epoch": 0.17363073408284735, + "flos": 23803495896960.0, + "grad_norm": 1.9162495103262853, + "language_loss": 0.77306521, + "learning_rate": 3.787601808551879e-06, + "loss": 0.79556608, + "num_input_tokens_seen": 30516325, + "step": 1444, + "time_per_iteration": 2.516946792602539 + }, + { + "auxiliary_loss_clip": 0.0118117, + "auxiliary_loss_mlp": 0.01045223, + "balance_loss_clip": 1.05812764, + "balance_loss_mlp": 1.03485739, + "epoch": 0.17375097697348643, + "flos": 18515290959360.0, + "grad_norm": 2.439937634319878, + "language_loss": 0.83674777, + "learning_rate": 3.7872523322446926e-06, + "loss": 0.85901171, + "num_input_tokens_seen": 30535210, + "step": 1445, + "time_per_iteration": 2.5380184650421143 + }, + { + "auxiliary_loss_clip": 0.01169404, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.05203438, + "balance_loss_mlp": 1.02079868, + "epoch": 0.17387121986412554, + "flos": 38877897456000.0, + "grad_norm": 1.7687062258596444, + "language_loss": 0.60204852, + "learning_rate": 3.7869025848127478e-06, + "loss": 0.62404728, + "num_input_tokens_seen": 30559405, + "step": 1446, + "time_per_iteration": 2.8109099864959717 + }, + { + "auxiliary_loss_clip": 0.01208658, + "auxiliary_loss_mlp": 0.01038238, + "balance_loss_clip": 1.05843425, + "balance_loss_mlp": 1.02814138, + "epoch": 0.17399146275476463, + "flos": 20375714960640.0, + "grad_norm": 4.609653296830032, + "language_loss": 0.80659467, + "learning_rate": 3.786552566309102e-06, + "loss": 0.82906365, + "num_input_tokens_seen": 30577615, + "step": 1447, + "time_per_iteration": 2.563925266265869 + }, + { + "auxiliary_loss_clip": 0.01191322, + "auxiliary_loss_mlp": 0.00765469, + "balance_loss_clip": 1.06150126, + "balance_loss_mlp": 1.00075054, + "epoch": 0.1741117056454037, + "flos": 19164339763200.0, + "grad_norm": 2.155001230410101, + "language_loss": 0.86027318, + "learning_rate": 3.7862022767868517e-06, + "loss": 0.87984109, + "num_input_tokens_seen": 30595205, + "step": 1448, + "time_per_iteration": 2.5667619705200195 + }, + { + "auxiliary_loss_clip": 0.01177304, + "auxiliary_loss_mlp": 0.01042081, + "balance_loss_clip": 1.06241715, + "balance_loss_mlp": 1.0320673, + "epoch": 0.17423194853604282, + "flos": 25374300537600.0, + "grad_norm": 2.703550903405167, + "language_loss": 0.84045398, + "learning_rate": 3.7858517162991367e-06, + "loss": 0.86264777, + "num_input_tokens_seen": 30615280, + "step": 1449, + "time_per_iteration": 2.5943922996520996 + }, + { + "auxiliary_loss_clip": 0.01180372, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.05726457, + "balance_loss_mlp": 1.02626264, + "epoch": 0.1743521914266819, + "flos": 25191874339200.0, + "grad_norm": 2.35581001849865, + "language_loss": 0.60471541, + "learning_rate": 3.7855008848991363e-06, + "loss": 0.62688971, + "num_input_tokens_seen": 30633485, + "step": 1450, + "time_per_iteration": 2.662813901901245 + }, + { + "auxiliary_loss_clip": 0.01190967, + "auxiliary_loss_mlp": 0.0103882, + "balance_loss_clip": 1.06117392, + "balance_loss_mlp": 1.02910447, + "epoch": 0.17447243431732098, + "flos": 25666577504640.0, + "grad_norm": 2.1520722266441887, + "language_loss": 0.77669948, + "learning_rate": 3.7851497826400714e-06, + "loss": 0.7989974, + "num_input_tokens_seen": 30653625, + "step": 1451, + "time_per_iteration": 2.6153042316436768 + }, + { + "auxiliary_loss_clip": 0.01226882, + "auxiliary_loss_mlp": 0.01039907, + "balance_loss_clip": 1.06416225, + "balance_loss_mlp": 1.02949405, + "epoch": 0.17459267720796007, + "flos": 36281950657920.0, + "grad_norm": 2.3020210001543644, + "language_loss": 0.76027685, + "learning_rate": 3.7847984095752034e-06, + "loss": 0.78294468, + "num_input_tokens_seen": 30677080, + "step": 1452, + "time_per_iteration": 2.599299907684326 + }, + { + "auxiliary_loss_clip": 0.01225495, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.06297827, + "balance_loss_mlp": 1.02205372, + "epoch": 0.17471292009859918, + "flos": 20011113959040.0, + "grad_norm": 1.9298031505748081, + "language_loss": 0.80195272, + "learning_rate": 3.784446765757836e-06, + "loss": 0.82452631, + "num_input_tokens_seen": 30695725, + "step": 1453, + "time_per_iteration": 2.4770569801330566 + }, + { + "auxiliary_loss_clip": 0.01160946, + "auxiliary_loss_mlp": 0.01034635, + "balance_loss_clip": 1.05506372, + "balance_loss_mlp": 1.02446628, + "epoch": 0.17483316298923826, + "flos": 27819242559360.0, + "grad_norm": 2.3354274889735556, + "language_loss": 0.776748, + "learning_rate": 3.7840948512413133e-06, + "loss": 0.79870385, + "num_input_tokens_seen": 30713310, + "step": 1454, + "time_per_iteration": 2.60199236869812 + }, + { + "auxiliary_loss_clip": 0.01177273, + "auxiliary_loss_mlp": 0.01037327, + "balance_loss_clip": 1.06002808, + "balance_loss_mlp": 1.02616858, + "epoch": 0.17495340587987734, + "flos": 44017934791680.0, + "grad_norm": 1.89224206983156, + "language_loss": 0.78945887, + "learning_rate": 3.7837426660790196e-06, + "loss": 0.81160486, + "num_input_tokens_seen": 30734725, + "step": 1455, + "time_per_iteration": 2.738637924194336 + }, + { + "auxiliary_loss_clip": 0.01222324, + "auxiliary_loss_mlp": 0.01042938, + "balance_loss_clip": 1.0622673, + "balance_loss_mlp": 1.03334212, + "epoch": 0.17507364877051645, + "flos": 20885825957760.0, + "grad_norm": 2.4297032226263107, + "language_loss": 0.81725085, + "learning_rate": 3.783390210324382e-06, + "loss": 0.83990347, + "num_input_tokens_seen": 30754450, + "step": 1456, + "time_per_iteration": 2.4665238857269287 + }, + { + "auxiliary_loss_clip": 0.01177982, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.05898142, + "balance_loss_mlp": 1.02416682, + "epoch": 0.17519389166115554, + "flos": 24717602136960.0, + "grad_norm": 1.973457794068408, + "language_loss": 0.73072994, + "learning_rate": 3.7830374840308676e-06, + "loss": 0.75284994, + "num_input_tokens_seen": 30774605, + "step": 1457, + "time_per_iteration": 2.5830042362213135 + }, + { + "auxiliary_loss_clip": 0.01213608, + "auxiliary_loss_mlp": 0.01038405, + "balance_loss_clip": 1.06407142, + "balance_loss_mlp": 1.02771223, + "epoch": 0.17531413455179462, + "flos": 23798144770560.0, + "grad_norm": 2.555148700001538, + "language_loss": 0.82423353, + "learning_rate": 3.7826844872519842e-06, + "loss": 0.84675366, + "num_input_tokens_seen": 30792460, + "step": 1458, + "time_per_iteration": 2.5104057788848877 + }, + { + "auxiliary_loss_clip": 0.0119335, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.06257606, + "balance_loss_mlp": 1.0265826, + "epoch": 0.1754343774424337, + "flos": 24572379450240.0, + "grad_norm": 1.8569162347459516, + "language_loss": 0.72393346, + "learning_rate": 3.782331220041282e-06, + "loss": 0.74622434, + "num_input_tokens_seen": 30812525, + "step": 1459, + "time_per_iteration": 2.5745351314544678 + }, + { + "auxiliary_loss_clip": 0.01187546, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.05800128, + "balance_loss_mlp": 1.02525711, + "epoch": 0.17555462033307281, + "flos": 18114599767680.0, + "grad_norm": 2.042219647492752, + "language_loss": 0.82882649, + "learning_rate": 3.7819776824523504e-06, + "loss": 0.85105312, + "num_input_tokens_seen": 30830390, + "step": 1460, + "time_per_iteration": 3.3179755210876465 + }, + { + "auxiliary_loss_clip": 0.01202059, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.06060779, + "balance_loss_mlp": 1.02904034, + "epoch": 0.1756748632237119, + "flos": 28366018364160.0, + "grad_norm": 1.8475730294686807, + "language_loss": 0.84003615, + "learning_rate": 3.7816238745388213e-06, + "loss": 0.8624481, + "num_input_tokens_seen": 30849935, + "step": 1461, + "time_per_iteration": 2.5845611095428467 + }, + { + "auxiliary_loss_clip": 0.01200645, + "auxiliary_loss_mlp": 0.0103381, + "balance_loss_clip": 1.05853176, + "balance_loss_mlp": 1.02432132, + "epoch": 0.17579510611435098, + "flos": 25732939881600.0, + "grad_norm": 1.8750138804707794, + "language_loss": 0.87164271, + "learning_rate": 3.781269796354367e-06, + "loss": 0.89398724, + "num_input_tokens_seen": 30869555, + "step": 1462, + "time_per_iteration": 2.5806846618652344 + }, + { + "auxiliary_loss_clip": 0.01195185, + "auxiliary_loss_mlp": 0.01037909, + "balance_loss_clip": 1.06106198, + "balance_loss_mlp": 1.02833033, + "epoch": 0.1759153490049901, + "flos": 18588081870720.0, + "grad_norm": 1.947386897657048, + "language_loss": 0.86154115, + "learning_rate": 3.7809154479527006e-06, + "loss": 0.88387203, + "num_input_tokens_seen": 30888760, + "step": 1463, + "time_per_iteration": 2.5226638317108154 + }, + { + "auxiliary_loss_clip": 0.01169522, + "auxiliary_loss_mlp": 0.01028578, + "balance_loss_clip": 1.05673873, + "balance_loss_mlp": 1.01911306, + "epoch": 0.17603559189562917, + "flos": 18619323724800.0, + "grad_norm": 2.001769956131492, + "language_loss": 0.84472024, + "learning_rate": 3.780560829387577e-06, + "loss": 0.86670125, + "num_input_tokens_seen": 30907260, + "step": 1464, + "time_per_iteration": 2.554675579071045 + }, + { + "auxiliary_loss_clip": 0.01128691, + "auxiliary_loss_mlp": 0.01005733, + "balance_loss_clip": 1.0426625, + "balance_loss_mlp": 1.00286007, + "epoch": 0.17615583478626826, + "flos": 60530775373440.0, + "grad_norm": 0.8495114524484088, + "language_loss": 0.57915521, + "learning_rate": 3.7802059407127915e-06, + "loss": 0.60049939, + "num_input_tokens_seen": 30965810, + "step": 1465, + "time_per_iteration": 3.0502445697784424 + }, + { + "auxiliary_loss_clip": 0.01186942, + "auxiliary_loss_mlp": 0.01040673, + "balance_loss_clip": 1.05628407, + "balance_loss_mlp": 1.03075504, + "epoch": 0.17627607767690734, + "flos": 23616221362560.0, + "grad_norm": 2.2109287551385486, + "language_loss": 0.85917604, + "learning_rate": 3.7798507819821797e-06, + "loss": 0.8814522, + "num_input_tokens_seen": 30982935, + "step": 1466, + "time_per_iteration": 2.5486092567443848 + }, + { + "auxiliary_loss_clip": 0.01173108, + "auxiliary_loss_mlp": 0.0104459, + "balance_loss_clip": 1.05823112, + "balance_loss_mlp": 1.03389132, + "epoch": 0.17639632056754645, + "flos": 17639070589440.0, + "grad_norm": 2.0832754892882273, + "language_loss": 0.78466392, + "learning_rate": 3.7794953532496197e-06, + "loss": 0.8068409, + "num_input_tokens_seen": 30998840, + "step": 1467, + "time_per_iteration": 4.221733093261719 + }, + { + "auxiliary_loss_clip": 0.01071842, + "auxiliary_loss_mlp": 0.00754799, + "balance_loss_clip": 1.03470135, + "balance_loss_mlp": 0.99977905, + "epoch": 0.17651656345818553, + "flos": 57932604910080.0, + "grad_norm": 0.855341136007038, + "language_loss": 0.57919025, + "learning_rate": 3.7791396545690295e-06, + "loss": 0.59745669, + "num_input_tokens_seen": 31060075, + "step": 1468, + "time_per_iteration": 3.8507862091064453 + }, + { + "auxiliary_loss_clip": 0.01210613, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.06518435, + "balance_loss_mlp": 1.02934897, + "epoch": 0.17663680634882462, + "flos": 22929502170240.0, + "grad_norm": 2.158932158853507, + "language_loss": 0.81077164, + "learning_rate": 3.7787836859943685e-06, + "loss": 0.8332665, + "num_input_tokens_seen": 31078800, + "step": 1469, + "time_per_iteration": 2.529515504837036 + }, + { + "auxiliary_loss_clip": 0.01209728, + "auxiliary_loss_mlp": 0.01038, + "balance_loss_clip": 1.06318355, + "balance_loss_mlp": 1.02758694, + "epoch": 0.17675704923946373, + "flos": 22637979388800.0, + "grad_norm": 2.278284758983468, + "language_loss": 0.78876251, + "learning_rate": 3.7784274475796363e-06, + "loss": 0.81123972, + "num_input_tokens_seen": 31097430, + "step": 1470, + "time_per_iteration": 2.5154569149017334 + }, + { + "auxiliary_loss_clip": 0.01178776, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.05563188, + "balance_loss_mlp": 1.02638745, + "epoch": 0.1768772921301028, + "flos": 27126525795840.0, + "grad_norm": 1.9951574275723658, + "language_loss": 0.7625432, + "learning_rate": 3.7780709393788745e-06, + "loss": 0.78469396, + "num_input_tokens_seen": 31117905, + "step": 1471, + "time_per_iteration": 2.7140955924987793 + }, + { + "auxiliary_loss_clip": 0.01222892, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.06321156, + "balance_loss_mlp": 1.02297306, + "epoch": 0.1769975350207419, + "flos": 19172133014400.0, + "grad_norm": 4.366009816233025, + "language_loss": 0.75224197, + "learning_rate": 3.777714161446165e-06, + "loss": 0.77480567, + "num_input_tokens_seen": 31137610, + "step": 1472, + "time_per_iteration": 2.4866795539855957 + }, + { + "auxiliary_loss_clip": 0.01208347, + "auxiliary_loss_mlp": 0.01030992, + "balance_loss_clip": 1.06228113, + "balance_loss_mlp": 1.02123463, + "epoch": 0.177117777911381, + "flos": 36134932291200.0, + "grad_norm": 2.1389740925599288, + "language_loss": 0.69545299, + "learning_rate": 3.7773571138356304e-06, + "loss": 0.71784639, + "num_input_tokens_seen": 31157780, + "step": 1473, + "time_per_iteration": 2.6275858879089355 + }, + { + "auxiliary_loss_clip": 0.01150146, + "auxiliary_loss_mlp": 0.01028438, + "balance_loss_clip": 1.0546205, + "balance_loss_mlp": 1.01954544, + "epoch": 0.17723802080202009, + "flos": 22090593052800.0, + "grad_norm": 2.5795402931985296, + "language_loss": 0.89260077, + "learning_rate": 3.776999796601435e-06, + "loss": 0.91438657, + "num_input_tokens_seen": 31176540, + "step": 1474, + "time_per_iteration": 2.667428493499756 + }, + { + "auxiliary_loss_clip": 0.01214343, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.06284189, + "balance_loss_mlp": 1.02686131, + "epoch": 0.17735826369265917, + "flos": 30222671437440.0, + "grad_norm": 2.1946095154893928, + "language_loss": 0.7291131, + "learning_rate": 3.776642209797783e-06, + "loss": 0.75162393, + "num_input_tokens_seen": 31198370, + "step": 1475, + "time_per_iteration": 2.5917141437530518 + }, + { + "auxiliary_loss_clip": 0.01204982, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.05961418, + "balance_loss_mlp": 1.02152681, + "epoch": 0.17747850658329825, + "flos": 21397588980480.0, + "grad_norm": 2.429136957104276, + "language_loss": 0.78360379, + "learning_rate": 3.7762843534789205e-06, + "loss": 0.80597675, + "num_input_tokens_seen": 31217120, + "step": 1476, + "time_per_iteration": 2.5454788208007812 + }, + { + "auxiliary_loss_clip": 0.01198183, + "auxiliary_loss_mlp": 0.01035271, + "balance_loss_clip": 1.05950713, + "balance_loss_mlp": 1.02564454, + "epoch": 0.17759874947393736, + "flos": 16983341856000.0, + "grad_norm": 2.0581964809595625, + "language_loss": 0.88305044, + "learning_rate": 3.7759262276991343e-06, + "loss": 0.90538502, + "num_input_tokens_seen": 31234730, + "step": 1477, + "time_per_iteration": 2.535046339035034 + }, + { + "auxiliary_loss_clip": 0.01200286, + "auxiliary_loss_mlp": 0.01032532, + "balance_loss_clip": 1.06151652, + "balance_loss_mlp": 1.02220833, + "epoch": 0.17771899236457644, + "flos": 11546107390080.0, + "grad_norm": 2.3361819428161628, + "language_loss": 0.80585402, + "learning_rate": 3.7755678325127506e-06, + "loss": 0.82818222, + "num_input_tokens_seen": 31252410, + "step": 1478, + "time_per_iteration": 2.5008137226104736 + }, + { + "auxiliary_loss_clip": 0.011603, + "auxiliary_loss_mlp": 0.01032603, + "balance_loss_clip": 1.05909979, + "balance_loss_mlp": 1.02286983, + "epoch": 0.17783923525521553, + "flos": 18807747494400.0, + "grad_norm": 1.713092054486288, + "language_loss": 0.75895846, + "learning_rate": 3.7752091679741393e-06, + "loss": 0.78088742, + "num_input_tokens_seen": 31270200, + "step": 1479, + "time_per_iteration": 2.5827689170837402 + }, + { + "auxiliary_loss_clip": 0.01207346, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.06191838, + "balance_loss_mlp": 1.02300429, + "epoch": 0.17795947814585464, + "flos": 30408365773440.0, + "grad_norm": 4.302458204812137, + "language_loss": 0.77533615, + "learning_rate": 3.774850234137708e-06, + "loss": 0.7977432, + "num_input_tokens_seen": 31287495, + "step": 1480, + "time_per_iteration": 2.7176499366760254 + }, + { + "auxiliary_loss_clip": 0.01206287, + "auxiliary_loss_mlp": 0.01037276, + "balance_loss_clip": 1.06163311, + "balance_loss_mlp": 1.02706623, + "epoch": 0.17807972103649372, + "flos": 24389055411840.0, + "grad_norm": 2.197802986589133, + "language_loss": 0.82569242, + "learning_rate": 3.7744910310579076e-06, + "loss": 0.84812808, + "num_input_tokens_seen": 31306420, + "step": 1481, + "time_per_iteration": 2.55305552482605 + }, + { + "auxiliary_loss_clip": 0.01225942, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.06754196, + "balance_loss_mlp": 1.02240181, + "epoch": 0.1781999639271328, + "flos": 20301559332480.0, + "grad_norm": 2.1169771261957395, + "language_loss": 0.85378468, + "learning_rate": 3.774131558789229e-06, + "loss": 0.87635422, + "num_input_tokens_seen": 31325750, + "step": 1482, + "time_per_iteration": 2.4669079780578613 + }, + { + "auxiliary_loss_clip": 0.01224661, + "auxiliary_loss_mlp": 0.00764944, + "balance_loss_clip": 1.06543303, + "balance_loss_mlp": 1.00091124, + "epoch": 0.1783202068177719, + "flos": 15924479806080.0, + "grad_norm": 2.5119747375358408, + "language_loss": 0.69803882, + "learning_rate": 3.773771817386203e-06, + "loss": 0.71793485, + "num_input_tokens_seen": 31343080, + "step": 1483, + "time_per_iteration": 2.470374822616577 + }, + { + "auxiliary_loss_clip": 0.01192895, + "auxiliary_loss_mlp": 0.01032248, + "balance_loss_clip": 1.06067371, + "balance_loss_mlp": 1.02302694, + "epoch": 0.178440449708411, + "flos": 20631758083200.0, + "grad_norm": 1.4890868257939718, + "language_loss": 0.79691088, + "learning_rate": 3.773411806903403e-06, + "loss": 0.81916225, + "num_input_tokens_seen": 31362160, + "step": 1484, + "time_per_iteration": 2.5353639125823975 + }, + { + "auxiliary_loss_clip": 0.01152099, + "auxiliary_loss_mlp": 0.01036826, + "balance_loss_clip": 1.05368328, + "balance_loss_mlp": 1.02650261, + "epoch": 0.17856069259905008, + "flos": 21686059105920.0, + "grad_norm": 1.6847964208177657, + "language_loss": 0.94846934, + "learning_rate": 3.7730515273954415e-06, + "loss": 0.97035861, + "num_input_tokens_seen": 31380770, + "step": 1485, + "time_per_iteration": 2.637413263320923 + }, + { + "auxiliary_loss_clip": 0.01224606, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.06681061, + "balance_loss_mlp": 1.0253253, + "epoch": 0.17868093548968916, + "flos": 26572962320640.0, + "grad_norm": 1.8637740361500366, + "language_loss": 0.8491075, + "learning_rate": 3.772690978916973e-06, + "loss": 0.87169778, + "num_input_tokens_seen": 31400525, + "step": 1486, + "time_per_iteration": 2.5730369091033936 + }, + { + "auxiliary_loss_clip": 0.01209227, + "auxiliary_loss_mlp": 0.01038043, + "balance_loss_clip": 1.06364071, + "balance_loss_mlp": 1.02794051, + "epoch": 0.17880117838032827, + "flos": 18581006891520.0, + "grad_norm": 1.9737619373313253, + "language_loss": 0.86346591, + "learning_rate": 3.772330161522693e-06, + "loss": 0.88593858, + "num_input_tokens_seen": 31418435, + "step": 1487, + "time_per_iteration": 3.3004579544067383 + }, + { + "auxiliary_loss_clip": 0.01193917, + "auxiliary_loss_mlp": 0.01037744, + "balance_loss_clip": 1.06570292, + "balance_loss_mlp": 1.0276525, + "epoch": 0.17892142127096736, + "flos": 26541217676160.0, + "grad_norm": 1.9298624089719283, + "language_loss": 0.80043507, + "learning_rate": 3.7719690752673365e-06, + "loss": 0.82275164, + "num_input_tokens_seen": 31439230, + "step": 1488, + "time_per_iteration": 2.592899799346924 + }, + { + "auxiliary_loss_clip": 0.01183723, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.06366706, + "balance_loss_mlp": 1.02635431, + "epoch": 0.17904166416160644, + "flos": 23872623621120.0, + "grad_norm": 2.047057719401131, + "language_loss": 0.7816599, + "learning_rate": 3.7716077202056796e-06, + "loss": 0.80385721, + "num_input_tokens_seen": 31457705, + "step": 1489, + "time_per_iteration": 2.610059976577759 + }, + { + "auxiliary_loss_clip": 0.01179867, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.05894876, + "balance_loss_mlp": 1.02436733, + "epoch": 0.17916190705224552, + "flos": 19134426712320.0, + "grad_norm": 2.740640488183298, + "language_loss": 0.93789661, + "learning_rate": 3.7712460963925404e-06, + "loss": 0.9600336, + "num_input_tokens_seen": 31473645, + "step": 1490, + "time_per_iteration": 2.505293846130371 + }, + { + "auxiliary_loss_clip": 0.01185243, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.05834007, + "balance_loss_mlp": 1.02176023, + "epoch": 0.17928214994288463, + "flos": 25152120961920.0, + "grad_norm": 2.267990210001713, + "language_loss": 0.75221699, + "learning_rate": 3.7708842038827775e-06, + "loss": 0.77438414, + "num_input_tokens_seen": 31492605, + "step": 1491, + "time_per_iteration": 2.5933423042297363 + }, + { + "auxiliary_loss_clip": 0.01207974, + "auxiliary_loss_mlp": 0.01035046, + "balance_loss_clip": 1.06115985, + "balance_loss_mlp": 1.02586687, + "epoch": 0.17940239283352372, + "flos": 22384629786240.0, + "grad_norm": 1.8536466907740017, + "language_loss": 0.85825455, + "learning_rate": 3.770522042731288e-06, + "loss": 0.88068473, + "num_input_tokens_seen": 31514500, + "step": 1492, + "time_per_iteration": 2.542797803878784 + }, + { + "auxiliary_loss_clip": 0.01156358, + "auxiliary_loss_mlp": 0.01046, + "balance_loss_clip": 1.05834937, + "balance_loss_mlp": 1.03580189, + "epoch": 0.1795226357241628, + "flos": 23178685795200.0, + "grad_norm": 1.7988507727588825, + "language_loss": 0.87881899, + "learning_rate": 3.7701596129930122e-06, + "loss": 0.90084255, + "num_input_tokens_seen": 31533225, + "step": 1493, + "time_per_iteration": 4.280670404434204 + }, + { + "auxiliary_loss_clip": 0.01187977, + "auxiliary_loss_mlp": 0.01031104, + "balance_loss_clip": 1.06107438, + "balance_loss_mlp": 1.02035737, + "epoch": 0.1796428786148019, + "flos": 22090413484800.0, + "grad_norm": 2.064594396199253, + "language_loss": 0.73572993, + "learning_rate": 3.7697969147229315e-06, + "loss": 0.75792074, + "num_input_tokens_seen": 31551385, + "step": 1494, + "time_per_iteration": 2.570464611053467 + }, + { + "auxiliary_loss_clip": 0.01205212, + "auxiliary_loss_mlp": 0.01035446, + "balance_loss_clip": 1.06118536, + "balance_loss_mlp": 1.02566493, + "epoch": 0.179763121505441, + "flos": 21324618501120.0, + "grad_norm": 2.103074518000646, + "language_loss": 0.85422015, + "learning_rate": 3.7694339479760647e-06, + "loss": 0.87662673, + "num_input_tokens_seen": 31570415, + "step": 1495, + "time_per_iteration": 3.258523464202881 + }, + { + "auxiliary_loss_clip": 0.01113256, + "auxiliary_loss_mlp": 0.01003051, + "balance_loss_clip": 1.03997636, + "balance_loss_mlp": 1.00044, + "epoch": 0.17988336439608008, + "flos": 68161864815360.0, + "grad_norm": 4.514920265727813, + "language_loss": 0.57378691, + "learning_rate": 3.769070712807476e-06, + "loss": 0.5949499, + "num_input_tokens_seen": 31632445, + "step": 1496, + "time_per_iteration": 3.197551965713501 + }, + { + "auxiliary_loss_clip": 0.01137732, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.05715156, + "balance_loss_mlp": 1.02822709, + "epoch": 0.18000360728671919, + "flos": 21945047143680.0, + "grad_norm": 1.764423469370373, + "language_loss": 0.7895304, + "learning_rate": 3.768707209272266e-06, + "loss": 0.81128979, + "num_input_tokens_seen": 31652575, + "step": 1497, + "time_per_iteration": 2.6191747188568115 + }, + { + "auxiliary_loss_clip": 0.01190722, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.06033218, + "balance_loss_mlp": 1.02738142, + "epoch": 0.18012385017735827, + "flos": 18986330937600.0, + "grad_norm": 2.0468522117321974, + "language_loss": 0.76846707, + "learning_rate": 3.768343437425579e-06, + "loss": 0.79074752, + "num_input_tokens_seen": 31671145, + "step": 1498, + "time_per_iteration": 2.5120108127593994 + }, + { + "auxiliary_loss_clip": 0.01126844, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.05371094, + "balance_loss_mlp": 1.0236342, + "epoch": 0.18024409306799735, + "flos": 19748103598080.0, + "grad_norm": 2.3269925917180316, + "language_loss": 0.8611154, + "learning_rate": 3.7679793973225987e-06, + "loss": 0.88271976, + "num_input_tokens_seen": 31686955, + "step": 1499, + "time_per_iteration": 2.649259090423584 + }, + { + "auxiliary_loss_clip": 0.0107695, + "auxiliary_loss_mlp": 0.01003823, + "balance_loss_clip": 1.03430009, + "balance_loss_mlp": 1.00114083, + "epoch": 0.18036433595863643, + "flos": 67227183060480.0, + "grad_norm": 0.8480529418827248, + "language_loss": 0.61628759, + "learning_rate": 3.767615089018549e-06, + "loss": 0.63709533, + "num_input_tokens_seen": 31749300, + "step": 1500, + "time_per_iteration": 3.1692075729370117 + }, + { + "auxiliary_loss_clip": 0.01188465, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.05985451, + "balance_loss_mlp": 1.02629113, + "epoch": 0.18048457884927555, + "flos": 18181464935040.0, + "grad_norm": 2.2498618680409166, + "language_loss": 0.86419702, + "learning_rate": 3.7672505125686966e-06, + "loss": 0.88644719, + "num_input_tokens_seen": 31765665, + "step": 1501, + "time_per_iteration": 2.505552053451538 + }, + { + "auxiliary_loss_clip": 0.01164955, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.05516553, + "balance_loss_mlp": 1.02716851, + "epoch": 0.18060482173991463, + "flos": 15813767111040.0, + "grad_norm": 3.0057506041281044, + "language_loss": 0.84138823, + "learning_rate": 3.7668856680283455e-06, + "loss": 0.86340821, + "num_input_tokens_seen": 31782690, + "step": 1502, + "time_per_iteration": 2.561558246612549 + }, + { + "auxiliary_loss_clip": 0.01199503, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.06205773, + "balance_loss_mlp": 1.02741385, + "epoch": 0.1807250646305537, + "flos": 18587399512320.0, + "grad_norm": 1.7969620063242147, + "language_loss": 0.82497483, + "learning_rate": 3.7665205554528437e-06, + "loss": 0.84734225, + "num_input_tokens_seen": 31802045, + "step": 1503, + "time_per_iteration": 2.5025320053100586 + }, + { + "auxiliary_loss_clip": 0.01199605, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.06707644, + "balance_loss_mlp": 1.02281022, + "epoch": 0.18084530752119282, + "flos": 23149131880320.0, + "grad_norm": 1.7578252908519156, + "language_loss": 0.74376667, + "learning_rate": 3.7661551748975782e-06, + "loss": 0.76608825, + "num_input_tokens_seen": 31820220, + "step": 1504, + "time_per_iteration": 2.5450692176818848 + }, + { + "auxiliary_loss_clip": 0.01104328, + "auxiliary_loss_mlp": 0.01003541, + "balance_loss_clip": 1.03164744, + "balance_loss_mlp": 1.00079918, + "epoch": 0.1809655504118319, + "flos": 59803153568640.0, + "grad_norm": 0.9104568629095467, + "language_loss": 0.60551387, + "learning_rate": 3.7657895264179772e-06, + "loss": 0.62659252, + "num_input_tokens_seen": 31876195, + "step": 1505, + "time_per_iteration": 3.0797336101531982 + }, + { + "auxiliary_loss_clip": 0.01184072, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.05677164, + "balance_loss_mlp": 1.02409005, + "epoch": 0.181085793302471, + "flos": 44201941188480.0, + "grad_norm": 2.9132779773083035, + "language_loss": 0.74252313, + "learning_rate": 3.765423610069509e-06, + "loss": 0.76469862, + "num_input_tokens_seen": 31901585, + "step": 1506, + "time_per_iteration": 2.7813596725463867 + }, + { + "auxiliary_loss_clip": 0.01196176, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.06472647, + "balance_loss_mlp": 1.02733707, + "epoch": 0.18120603619311007, + "flos": 34898384638080.0, + "grad_norm": 1.830509107320071, + "language_loss": 0.72516954, + "learning_rate": 3.765057425907683e-06, + "loss": 0.74750376, + "num_input_tokens_seen": 31923045, + "step": 1507, + "time_per_iteration": 2.6557724475860596 + }, + { + "auxiliary_loss_clip": 0.01210801, + "auxiliary_loss_mlp": 0.01038067, + "balance_loss_clip": 1.0616945, + "balance_loss_mlp": 1.02778554, + "epoch": 0.18132627908374918, + "flos": 21506757390720.0, + "grad_norm": 1.8188248935205904, + "language_loss": 0.78347516, + "learning_rate": 3.764690973988048e-06, + "loss": 0.80596387, + "num_input_tokens_seen": 31943385, + "step": 1508, + "time_per_iteration": 2.527029037475586 + }, + { + "auxiliary_loss_clip": 0.01181735, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.05971861, + "balance_loss_mlp": 1.02176225, + "epoch": 0.18144652197438826, + "flos": 29057693633280.0, + "grad_norm": 2.188100403863643, + "language_loss": 0.74156475, + "learning_rate": 3.7643242543661967e-06, + "loss": 0.76369631, + "num_input_tokens_seen": 31966045, + "step": 1509, + "time_per_iteration": 2.610428810119629 + }, + { + "auxiliary_loss_clip": 0.0109427, + "auxiliary_loss_mlp": 0.01010746, + "balance_loss_clip": 1.02890992, + "balance_loss_mlp": 1.00825465, + "epoch": 0.18156676486502735, + "flos": 68675064382080.0, + "grad_norm": 0.8151927676223004, + "language_loss": 0.60522521, + "learning_rate": 3.7639572670977573e-06, + "loss": 0.6262753, + "num_input_tokens_seen": 32021540, + "step": 1510, + "time_per_iteration": 3.000666618347168 + }, + { + "auxiliary_loss_clip": 0.01179659, + "auxiliary_loss_mlp": 0.01038989, + "balance_loss_clip": 1.05718935, + "balance_loss_mlp": 1.02893996, + "epoch": 0.18168700775566646, + "flos": 26471515334400.0, + "grad_norm": 1.5609653937695833, + "language_loss": 0.76758742, + "learning_rate": 3.7635900122384042e-06, + "loss": 0.78977394, + "num_input_tokens_seen": 32044535, + "step": 1511, + "time_per_iteration": 2.663465738296509 + }, + { + "auxiliary_loss_clip": 0.01195938, + "auxiliary_loss_mlp": 0.0104001, + "balance_loss_clip": 1.05869639, + "balance_loss_mlp": 1.0290246, + "epoch": 0.18180725064630554, + "flos": 15005668884480.0, + "grad_norm": 3.4339188296995973, + "language_loss": 0.8700766, + "learning_rate": 3.7632224898438477e-06, + "loss": 0.89243615, + "num_input_tokens_seen": 32061010, + "step": 1512, + "time_per_iteration": 2.4977450370788574 + }, + { + "auxiliary_loss_clip": 0.0118461, + "auxiliary_loss_mlp": 0.0103333, + "balance_loss_clip": 1.05840421, + "balance_loss_mlp": 1.02357912, + "epoch": 0.18192749353694462, + "flos": 19682387665920.0, + "grad_norm": 1.4877012113122992, + "language_loss": 0.79553455, + "learning_rate": 3.762854699969842e-06, + "loss": 0.81771398, + "num_input_tokens_seen": 32081520, + "step": 1513, + "time_per_iteration": 3.386384963989258 + }, + { + "auxiliary_loss_clip": 0.01206675, + "auxiliary_loss_mlp": 0.01041821, + "balance_loss_clip": 1.0639956, + "balance_loss_mlp": 1.0308243, + "epoch": 0.1820477364275837, + "flos": 20702717400960.0, + "grad_norm": 1.8971641765236287, + "language_loss": 0.73319352, + "learning_rate": 3.762486642672179e-06, + "loss": 0.75567847, + "num_input_tokens_seen": 32098460, + "step": 1514, + "time_per_iteration": 2.507464647293091 + }, + { + "auxiliary_loss_clip": 0.01190944, + "auxiliary_loss_mlp": 0.01038044, + "balance_loss_clip": 1.05835521, + "balance_loss_mlp": 1.0280962, + "epoch": 0.18216797931822282, + "flos": 17128708197120.0, + "grad_norm": 2.0553824890871986, + "language_loss": 0.87007093, + "learning_rate": 3.7621183180066946e-06, + "loss": 0.89236081, + "num_input_tokens_seen": 32116420, + "step": 1515, + "time_per_iteration": 2.5627338886260986 + }, + { + "auxiliary_loss_clip": 0.01191499, + "auxiliary_loss_mlp": 0.01035312, + "balance_loss_clip": 1.05834126, + "balance_loss_mlp": 1.02504253, + "epoch": 0.1822882222088619, + "flos": 29242561956480.0, + "grad_norm": 1.5215141883411794, + "language_loss": 0.7374227, + "learning_rate": 3.7617497260292625e-06, + "loss": 0.75969082, + "num_input_tokens_seen": 32138475, + "step": 1516, + "time_per_iteration": 2.6125621795654297 + }, + { + "auxiliary_loss_clip": 0.01187473, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.06143785, + "balance_loss_mlp": 1.02330732, + "epoch": 0.18240846509950098, + "flos": 17702739446400.0, + "grad_norm": 2.605499572940373, + "language_loss": 0.78701055, + "learning_rate": 3.7613808667957967e-06, + "loss": 0.80922544, + "num_input_tokens_seen": 32151165, + "step": 1517, + "time_per_iteration": 2.476233720779419 + }, + { + "auxiliary_loss_clip": 0.01194993, + "auxiliary_loss_mlp": 0.01044304, + "balance_loss_clip": 1.06038189, + "balance_loss_mlp": 1.03433812, + "epoch": 0.1825287079901401, + "flos": 14790025584000.0, + "grad_norm": 2.1357697738897183, + "language_loss": 0.90924817, + "learning_rate": 3.7610117403622547e-06, + "loss": 0.93164116, + "num_input_tokens_seen": 32167725, + "step": 1518, + "time_per_iteration": 2.5070536136627197 + }, + { + "auxiliary_loss_clip": 0.01169235, + "auxiliary_loss_mlp": 0.01040558, + "balance_loss_clip": 1.05352306, + "balance_loss_mlp": 1.0300144, + "epoch": 0.18264895088077918, + "flos": 21946232292480.0, + "grad_norm": 1.681642278010146, + "language_loss": 0.89922565, + "learning_rate": 3.7606423467846313e-06, + "loss": 0.92132354, + "num_input_tokens_seen": 32187330, + "step": 1519, + "time_per_iteration": 3.410222291946411 + }, + { + "auxiliary_loss_clip": 0.01184287, + "auxiliary_loss_mlp": 0.01040141, + "balance_loss_clip": 1.06102538, + "balance_loss_mlp": 1.03022909, + "epoch": 0.18276919377141826, + "flos": 20886759711360.0, + "grad_norm": 1.434769108150616, + "language_loss": 0.7946586, + "learning_rate": 3.760272686118964e-06, + "loss": 0.81690288, + "num_input_tokens_seen": 32205550, + "step": 1520, + "time_per_iteration": 3.417752981185913 + }, + { + "auxiliary_loss_clip": 0.01193656, + "auxiliary_loss_mlp": 0.01038381, + "balance_loss_clip": 1.05914736, + "balance_loss_mlp": 1.02868962, + "epoch": 0.18288943666205737, + "flos": 21469877101440.0, + "grad_norm": 2.016186871273352, + "language_loss": 0.92624295, + "learning_rate": 3.7599027584213297e-06, + "loss": 0.94856334, + "num_input_tokens_seen": 32224430, + "step": 1521, + "time_per_iteration": 3.2623822689056396 + }, + { + "auxiliary_loss_clip": 0.01211489, + "auxiliary_loss_mlp": 0.01038563, + "balance_loss_clip": 1.06056035, + "balance_loss_mlp": 1.02835262, + "epoch": 0.18300967955269645, + "flos": 21539363961600.0, + "grad_norm": 1.9658953811751863, + "language_loss": 0.7793622, + "learning_rate": 3.7595325637478465e-06, + "loss": 0.80186266, + "num_input_tokens_seen": 32242455, + "step": 1522, + "time_per_iteration": 2.48207950592041 + }, + { + "auxiliary_loss_clip": 0.01183828, + "auxiliary_loss_mlp": 0.01044729, + "balance_loss_clip": 1.06006742, + "balance_loss_mlp": 1.03318357, + "epoch": 0.18312992244333554, + "flos": 28876237102080.0, + "grad_norm": 2.0363307076798165, + "language_loss": 0.81768596, + "learning_rate": 3.7591621021546723e-06, + "loss": 0.83997154, + "num_input_tokens_seen": 32264450, + "step": 1523, + "time_per_iteration": 2.598876714706421 + }, + { + "auxiliary_loss_clip": 0.01198881, + "auxiliary_loss_mlp": 0.01036027, + "balance_loss_clip": 1.05824828, + "balance_loss_mlp": 1.02474356, + "epoch": 0.18325016533397462, + "flos": 20120102801280.0, + "grad_norm": 1.7008505788703276, + "language_loss": 0.81537443, + "learning_rate": 3.7587913736980062e-06, + "loss": 0.83772361, + "num_input_tokens_seen": 32284090, + "step": 1524, + "time_per_iteration": 2.50065016746521 + }, + { + "auxiliary_loss_clip": 0.01130617, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.04904747, + "balance_loss_mlp": 1.02687693, + "epoch": 0.18337040822461373, + "flos": 23329187781120.0, + "grad_norm": 1.6063378258604248, + "language_loss": 0.84336019, + "learning_rate": 3.7584203784340865e-06, + "loss": 0.86503613, + "num_input_tokens_seen": 32303260, + "step": 1525, + "time_per_iteration": 2.6171212196350098 + }, + { + "auxiliary_loss_clip": 0.01187262, + "auxiliary_loss_mlp": 0.01037775, + "balance_loss_clip": 1.05623412, + "balance_loss_mlp": 1.02760017, + "epoch": 0.1834906511152528, + "flos": 25009555881600.0, + "grad_norm": 2.1812027389513156, + "language_loss": 0.85733509, + "learning_rate": 3.7580491164191938e-06, + "loss": 0.8795855, + "num_input_tokens_seen": 32321570, + "step": 1526, + "time_per_iteration": 2.5753986835479736 + }, + { + "auxiliary_loss_clip": 0.01109682, + "auxiliary_loss_mlp": 0.01004203, + "balance_loss_clip": 1.02729988, + "balance_loss_mlp": 1.00154424, + "epoch": 0.1836108940058919, + "flos": 67251493589760.0, + "grad_norm": 0.8076315705857933, + "language_loss": 0.61306846, + "learning_rate": 3.757677587709648e-06, + "loss": 0.63420737, + "num_input_tokens_seen": 32384835, + "step": 1527, + "time_per_iteration": 3.209385871887207 + }, + { + "auxiliary_loss_clip": 0.01172461, + "auxiliary_loss_mlp": 0.01036698, + "balance_loss_clip": 1.05861878, + "balance_loss_mlp": 1.02675009, + "epoch": 0.183731136896531, + "flos": 25738721971200.0, + "grad_norm": 2.1933410267974964, + "language_loss": 0.75526786, + "learning_rate": 3.7573057923618095e-06, + "loss": 0.77735949, + "num_input_tokens_seen": 32404930, + "step": 1528, + "time_per_iteration": 2.6907362937927246 + }, + { + "auxiliary_loss_clip": 0.01158788, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.05195332, + "balance_loss_mlp": 1.02396226, + "epoch": 0.1838513797871701, + "flos": 20449403712000.0, + "grad_norm": 2.6770094243595057, + "language_loss": 0.74216211, + "learning_rate": 3.7569337304320793e-06, + "loss": 0.76409739, + "num_input_tokens_seen": 32424515, + "step": 1529, + "time_per_iteration": 2.6035349369049072 + }, + { + "auxiliary_loss_clip": 0.01091968, + "auxiliary_loss_mlp": 0.01002657, + "balance_loss_clip": 1.02373588, + "balance_loss_mlp": 1.00010586, + "epoch": 0.18397162267780917, + "flos": 68565141786240.0, + "grad_norm": 2.002893437752971, + "language_loss": 0.64455068, + "learning_rate": 3.756561401976899e-06, + "loss": 0.66549695, + "num_input_tokens_seen": 32484220, + "step": 1530, + "time_per_iteration": 2.99403715133667 + }, + { + "auxiliary_loss_clip": 0.01224167, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.06396341, + "balance_loss_mlp": 1.02453113, + "epoch": 0.18409186556844825, + "flos": 31941104976000.0, + "grad_norm": 2.902315012913277, + "language_loss": 0.82708448, + "learning_rate": 3.7561888070527514e-06, + "loss": 0.84967256, + "num_input_tokens_seen": 32506260, + "step": 1531, + "time_per_iteration": 2.581179618835449 + }, + { + "auxiliary_loss_clip": 0.01159704, + "auxiliary_loss_mlp": 0.00764749, + "balance_loss_clip": 1.05558276, + "balance_loss_mlp": 1.00165319, + "epoch": 0.18421210845908736, + "flos": 20120533764480.0, + "grad_norm": 2.304512789812378, + "language_loss": 0.79688239, + "learning_rate": 3.7558159457161577e-06, + "loss": 0.81612694, + "num_input_tokens_seen": 32524225, + "step": 1532, + "time_per_iteration": 2.5616469383239746 + }, + { + "auxiliary_loss_clip": 0.01195696, + "auxiliary_loss_mlp": 0.00765474, + "balance_loss_clip": 1.06339979, + "balance_loss_mlp": 1.00163865, + "epoch": 0.18433235134972645, + "flos": 23110491824640.0, + "grad_norm": 2.171485483059511, + "language_loss": 0.78154618, + "learning_rate": 3.755442818023681e-06, + "loss": 0.80115783, + "num_input_tokens_seen": 32543850, + "step": 1533, + "time_per_iteration": 2.5394325256347656 + }, + { + "auxiliary_loss_clip": 0.01180838, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.0600965, + "balance_loss_mlp": 1.02269816, + "epoch": 0.18445259424036553, + "flos": 18291351617280.0, + "grad_norm": 2.746214856149023, + "language_loss": 0.75996596, + "learning_rate": 3.7550694240319246e-06, + "loss": 0.78209633, + "num_input_tokens_seen": 32561725, + "step": 1534, + "time_per_iteration": 2.5386745929718018 + }, + { + "auxiliary_loss_clip": 0.01209616, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.06069005, + "balance_loss_mlp": 1.02284217, + "epoch": 0.18457283713100464, + "flos": 21324079797120.0, + "grad_norm": 2.347459196984291, + "language_loss": 0.76074064, + "learning_rate": 3.7546957637975326e-06, + "loss": 0.78316116, + "num_input_tokens_seen": 32579135, + "step": 1535, + "time_per_iteration": 2.4987564086914062 + }, + { + "auxiliary_loss_clip": 0.01133956, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.04652584, + "balance_loss_mlp": 1.02533722, + "epoch": 0.18469308002164372, + "flos": 20375679047040.0, + "grad_norm": 1.5648573358390365, + "language_loss": 0.73890042, + "learning_rate": 3.7543218373771873e-06, + "loss": 0.76059109, + "num_input_tokens_seen": 32598460, + "step": 1536, + "time_per_iteration": 2.6410598754882812 + }, + { + "auxiliary_loss_clip": 0.01138834, + "auxiliary_loss_mlp": 0.00764842, + "balance_loss_clip": 1.05341995, + "balance_loss_mlp": 1.00161314, + "epoch": 0.1848133229122828, + "flos": 26435892021120.0, + "grad_norm": 1.4305178161697758, + "language_loss": 0.78245389, + "learning_rate": 3.753947644827615e-06, + "loss": 0.8014906, + "num_input_tokens_seen": 32621920, + "step": 1537, + "time_per_iteration": 2.7086331844329834 + }, + { + "auxiliary_loss_clip": 0.01097863, + "auxiliary_loss_mlp": 0.01007098, + "balance_loss_clip": 1.02693629, + "balance_loss_mlp": 1.00420141, + "epoch": 0.1849335658029219, + "flos": 70547447612160.0, + "grad_norm": 0.9466289929534218, + "language_loss": 0.57233226, + "learning_rate": 3.753573186205579e-06, + "loss": 0.59338188, + "num_input_tokens_seen": 32690040, + "step": 1538, + "time_per_iteration": 3.2292301654815674 + }, + { + "auxiliary_loss_clip": 0.01178675, + "auxiliary_loss_mlp": 0.00764978, + "balance_loss_clip": 1.05461168, + "balance_loss_mlp": 1.00152361, + "epoch": 0.185053808693561, + "flos": 17384140788480.0, + "grad_norm": 2.3997511599309775, + "language_loss": 0.77776992, + "learning_rate": 3.753198461567885e-06, + "loss": 0.79720652, + "num_input_tokens_seen": 32707285, + "step": 1539, + "time_per_iteration": 2.5214316844940186 + }, + { + "auxiliary_loss_clip": 0.01172123, + "auxiliary_loss_mlp": 0.01039535, + "balance_loss_clip": 1.06048667, + "balance_loss_mlp": 1.0299505, + "epoch": 0.18517405158420008, + "flos": 28986159697920.0, + "grad_norm": 1.7380681267525369, + "language_loss": 0.91797101, + "learning_rate": 3.7528234709713783e-06, + "loss": 0.94008756, + "num_input_tokens_seen": 32730030, + "step": 1540, + "time_per_iteration": 3.4087469577789307 + }, + { + "auxiliary_loss_clip": 0.01207114, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.06225419, + "balance_loss_mlp": 1.02755165, + "epoch": 0.18529429447483917, + "flos": 26794962328320.0, + "grad_norm": 2.1009670548790442, + "language_loss": 0.84451687, + "learning_rate": 3.7524482144729447e-06, + "loss": 0.86695945, + "num_input_tokens_seen": 32749485, + "step": 1541, + "time_per_iteration": 2.5261754989624023 + }, + { + "auxiliary_loss_clip": 0.01169729, + "auxiliary_loss_mlp": 0.01043812, + "balance_loss_clip": 1.05321252, + "balance_loss_mlp": 1.03370941, + "epoch": 0.18541453736547828, + "flos": 13581595301760.0, + "grad_norm": 2.0002675676265924, + "language_loss": 0.83764476, + "learning_rate": 3.7520726921295106e-06, + "loss": 0.85978013, + "num_input_tokens_seen": 32766205, + "step": 1542, + "time_per_iteration": 2.537696123123169 + }, + { + "auxiliary_loss_clip": 0.01200029, + "auxiliary_loss_mlp": 0.01037473, + "balance_loss_clip": 1.05551124, + "balance_loss_mlp": 1.02770996, + "epoch": 0.18553478025611736, + "flos": 24025424077440.0, + "grad_norm": 2.1540854720962472, + "language_loss": 0.72559702, + "learning_rate": 3.751696903998042e-06, + "loss": 0.74797213, + "num_input_tokens_seen": 32784840, + "step": 1543, + "time_per_iteration": 2.530993938446045 + }, + { + "auxiliary_loss_clip": 0.01202769, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.06119204, + "balance_loss_mlp": 1.02571845, + "epoch": 0.18565502314675644, + "flos": 25885165720320.0, + "grad_norm": 1.8648186820523522, + "language_loss": 0.70149606, + "learning_rate": 3.7513208501355456e-06, + "loss": 0.72387761, + "num_input_tokens_seen": 32805945, + "step": 1544, + "time_per_iteration": 2.5216071605682373 + }, + { + "auxiliary_loss_clip": 0.01184486, + "auxiliary_loss_mlp": 0.01036935, + "balance_loss_clip": 1.05612588, + "balance_loss_mlp": 1.027601, + "epoch": 0.18577526603739553, + "flos": 19610063631360.0, + "grad_norm": 1.9026032211099624, + "language_loss": 0.83359313, + "learning_rate": 3.750944530599069e-06, + "loss": 0.85580736, + "num_input_tokens_seen": 32825515, + "step": 1545, + "time_per_iteration": 2.5397531986236572 + }, + { + "auxiliary_loss_clip": 0.01212013, + "auxiliary_loss_mlp": 0.00765541, + "balance_loss_clip": 1.06248677, + "balance_loss_mlp": 1.00149095, + "epoch": 0.18589550892803464, + "flos": 18474891137280.0, + "grad_norm": 2.4093407187983225, + "language_loss": 0.80668128, + "learning_rate": 3.7505679454456992e-06, + "loss": 0.82645679, + "num_input_tokens_seen": 32842125, + "step": 1546, + "time_per_iteration": 3.226943254470825 + }, + { + "auxiliary_loss_clip": 0.01123606, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.04883432, + "balance_loss_mlp": 1.02495944, + "epoch": 0.18601575181867372, + "flos": 23549966726400.0, + "grad_norm": 1.9983446963873355, + "language_loss": 0.69847667, + "learning_rate": 3.750191094732564e-06, + "loss": 0.72006321, + "num_input_tokens_seen": 32862990, + "step": 1547, + "time_per_iteration": 3.718635320663452 + }, + { + "auxiliary_loss_clip": 0.01124866, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.04918528, + "balance_loss_mlp": 1.02711463, + "epoch": 0.1861359947093128, + "flos": 26360192108160.0, + "grad_norm": 1.8191058941031168, + "language_loss": 0.75083649, + "learning_rate": 3.7498139785168313e-06, + "loss": 0.77245122, + "num_input_tokens_seen": 32883595, + "step": 1548, + "time_per_iteration": 3.481144428253174 + }, + { + "auxiliary_loss_clip": 0.01202968, + "auxiliary_loss_mlp": 0.01040031, + "balance_loss_clip": 1.06250334, + "balance_loss_mlp": 1.02975488, + "epoch": 0.1862562375999519, + "flos": 23331198942720.0, + "grad_norm": 1.8334972983197446, + "language_loss": 0.77460778, + "learning_rate": 3.749436596855709e-06, + "loss": 0.79703778, + "num_input_tokens_seen": 32902895, + "step": 1549, + "time_per_iteration": 2.5085291862487793 + }, + { + "auxiliary_loss_clip": 0.0119827, + "auxiliary_loss_mlp": 0.01032316, + "balance_loss_clip": 1.05699182, + "balance_loss_mlp": 1.0219748, + "epoch": 0.186376480490591, + "flos": 16648222942080.0, + "grad_norm": 2.4494272208097, + "language_loss": 0.90518767, + "learning_rate": 3.749058949806446e-06, + "loss": 0.92749357, + "num_input_tokens_seen": 32919620, + "step": 1550, + "time_per_iteration": 2.480605363845825 + }, + { + "auxiliary_loss_clip": 0.0120421, + "auxiliary_loss_mlp": 0.01031887, + "balance_loss_clip": 1.05871773, + "balance_loss_mlp": 1.02238619, + "epoch": 0.18649672338123008, + "flos": 21468656039040.0, + "grad_norm": 1.570947577975064, + "language_loss": 0.84367442, + "learning_rate": 3.748681037426331e-06, + "loss": 0.8660354, + "num_input_tokens_seen": 32938830, + "step": 1551, + "time_per_iteration": 2.526944398880005 + }, + { + "auxiliary_loss_clip": 0.01221474, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.06297278, + "balance_loss_mlp": 1.03006029, + "epoch": 0.1866169662718692, + "flos": 12312728386560.0, + "grad_norm": 2.2582317905945386, + "language_loss": 0.91802633, + "learning_rate": 3.7483028597726936e-06, + "loss": 0.94063556, + "num_input_tokens_seen": 32955600, + "step": 1552, + "time_per_iteration": 2.4482104778289795 + }, + { + "auxiliary_loss_clip": 0.01172441, + "auxiliary_loss_mlp": 0.01039889, + "balance_loss_clip": 1.05642521, + "balance_loss_mlp": 1.02983928, + "epoch": 0.18673720916250827, + "flos": 23581280407680.0, + "grad_norm": 2.5686354009574552, + "language_loss": 0.62850809, + "learning_rate": 3.7479244169029017e-06, + "loss": 0.65063143, + "num_input_tokens_seen": 32975390, + "step": 1553, + "time_per_iteration": 2.60425066947937 + }, + { + "auxiliary_loss_clip": 0.01205993, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.05682731, + "balance_loss_mlp": 1.01936948, + "epoch": 0.18685745205314735, + "flos": 19718370115200.0, + "grad_norm": 3.81416848843719, + "language_loss": 0.73412538, + "learning_rate": 3.7475457088743658e-06, + "loss": 0.75647146, + "num_input_tokens_seen": 32992640, + "step": 1554, + "time_per_iteration": 2.513031244277954 + }, + { + "auxiliary_loss_clip": 0.01181296, + "auxiliary_loss_mlp": 0.01039496, + "balance_loss_clip": 1.05780029, + "balance_loss_mlp": 1.02874887, + "epoch": 0.18697769494378644, + "flos": 34204123589760.0, + "grad_norm": 2.08819388970875, + "language_loss": 0.74445719, + "learning_rate": 3.7471667357445348e-06, + "loss": 0.76666504, + "num_input_tokens_seen": 33012470, + "step": 1555, + "time_per_iteration": 2.6156959533691406 + }, + { + "auxiliary_loss_clip": 0.01147034, + "auxiliary_loss_mlp": 0.0102712, + "balance_loss_clip": 1.05498934, + "balance_loss_mlp": 1.01796448, + "epoch": 0.18709793783442555, + "flos": 34241327101440.0, + "grad_norm": 1.8715351199422372, + "language_loss": 0.72317046, + "learning_rate": 3.7467874975709e-06, + "loss": 0.74491203, + "num_input_tokens_seen": 33033275, + "step": 1556, + "time_per_iteration": 2.774231433868408 + }, + { + "auxiliary_loss_clip": 0.01210674, + "auxiliary_loss_mlp": 0.01044213, + "balance_loss_clip": 1.06219125, + "balance_loss_mlp": 1.03426504, + "epoch": 0.18721818072506463, + "flos": 40734550529280.0, + "grad_norm": 1.9946060261181777, + "language_loss": 0.77806765, + "learning_rate": 3.7464079944109904e-06, + "loss": 0.8006165, + "num_input_tokens_seen": 33055135, + "step": 1557, + "time_per_iteration": 2.656710624694824 + }, + { + "auxiliary_loss_clip": 0.01176867, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.05627906, + "balance_loss_mlp": 1.02304721, + "epoch": 0.18733842361570371, + "flos": 22157386392960.0, + "grad_norm": 1.8940492322244606, + "language_loss": 0.77750742, + "learning_rate": 3.746028226322376e-06, + "loss": 0.79959977, + "num_input_tokens_seen": 33071015, + "step": 1558, + "time_per_iteration": 2.5436840057373047 + }, + { + "auxiliary_loss_clip": 0.01186655, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.05809438, + "balance_loss_mlp": 1.02285719, + "epoch": 0.18745866650634282, + "flos": 18914940656640.0, + "grad_norm": 1.691101638614279, + "language_loss": 0.75481844, + "learning_rate": 3.745648193362669e-06, + "loss": 0.77700734, + "num_input_tokens_seen": 33090370, + "step": 1559, + "time_per_iteration": 2.5269317626953125 + }, + { + "auxiliary_loss_clip": 0.01191393, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.05902386, + "balance_loss_mlp": 1.02544856, + "epoch": 0.1875789093969819, + "flos": 19314626267520.0, + "grad_norm": 1.990582355898355, + "language_loss": 0.72553068, + "learning_rate": 3.745267895589518e-06, + "loss": 0.74779111, + "num_input_tokens_seen": 33108910, + "step": 1560, + "time_per_iteration": 2.5093231201171875 + }, + { + "auxiliary_loss_clip": 0.0119095, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.06006432, + "balance_loss_mlp": 1.02597272, + "epoch": 0.187699152287621, + "flos": 17018965169280.0, + "grad_norm": 1.9053093804656958, + "language_loss": 0.82285178, + "learning_rate": 3.7448873330606154e-06, + "loss": 0.84511834, + "num_input_tokens_seen": 33126680, + "step": 1561, + "time_per_iteration": 2.5056815147399902 + }, + { + "auxiliary_loss_clip": 0.01169884, + "auxiliary_loss_mlp": 0.01035417, + "balance_loss_clip": 1.05939007, + "balance_loss_mlp": 1.02529597, + "epoch": 0.18781939517826007, + "flos": 22346384780160.0, + "grad_norm": 2.055608567219313, + "language_loss": 0.87481058, + "learning_rate": 3.7445065058336914e-06, + "loss": 0.89686364, + "num_input_tokens_seen": 33145550, + "step": 1562, + "time_per_iteration": 2.571474313735962 + }, + { + "auxiliary_loss_clip": 0.01146731, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.05012631, + "balance_loss_mlp": 1.02114391, + "epoch": 0.18793963806889918, + "flos": 14611478054400.0, + "grad_norm": 1.7780072356699181, + "language_loss": 0.86268795, + "learning_rate": 3.7441254139665176e-06, + "loss": 0.88445973, + "num_input_tokens_seen": 33161735, + "step": 1563, + "time_per_iteration": 2.5689122676849365 + }, + { + "auxiliary_loss_clip": 0.01220112, + "auxiliary_loss_mlp": 0.01038636, + "balance_loss_clip": 1.06428432, + "balance_loss_mlp": 1.02958822, + "epoch": 0.18805988095953827, + "flos": 17457075354240.0, + "grad_norm": 1.889981719556016, + "language_loss": 0.82513249, + "learning_rate": 3.743744057516905e-06, + "loss": 0.84771991, + "num_input_tokens_seen": 33179795, + "step": 1564, + "time_per_iteration": 2.4595906734466553 + }, + { + "auxiliary_loss_clip": 0.01160012, + "auxiliary_loss_mlp": 0.01040223, + "balance_loss_clip": 1.05430913, + "balance_loss_mlp": 1.02976871, + "epoch": 0.18818012385017735, + "flos": 15043877976960.0, + "grad_norm": 2.6625675410773963, + "language_loss": 0.87648696, + "learning_rate": 3.743362436542706e-06, + "loss": 0.89848924, + "num_input_tokens_seen": 33194485, + "step": 1565, + "time_per_iteration": 2.594287633895874 + }, + { + "auxiliary_loss_clip": 0.01216345, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.06055367, + "balance_loss_mlp": 1.02247405, + "epoch": 0.18830036674081646, + "flos": 47551975136640.0, + "grad_norm": 1.816273244239361, + "language_loss": 0.76638985, + "learning_rate": 3.7429805511018115e-06, + "loss": 0.78887045, + "num_input_tokens_seen": 33216145, + "step": 1566, + "time_per_iteration": 2.6811671257019043 + }, + { + "auxiliary_loss_clip": 0.01170264, + "auxiliary_loss_mlp": 0.00765555, + "balance_loss_clip": 1.05768275, + "balance_loss_mlp": 1.00140524, + "epoch": 0.18842060963145554, + "flos": 30044626698240.0, + "grad_norm": 1.7407027727474496, + "language_loss": 0.77963156, + "learning_rate": 3.7425984012521524e-06, + "loss": 0.79898977, + "num_input_tokens_seen": 33236345, + "step": 1567, + "time_per_iteration": 3.412440061569214 + }, + { + "auxiliary_loss_clip": 0.01082143, + "auxiliary_loss_mlp": 0.00755266, + "balance_loss_clip": 1.02781129, + "balance_loss_mlp": 0.99990445, + "epoch": 0.18854085252209463, + "flos": 70318372625280.0, + "grad_norm": 0.7348696941141475, + "language_loss": 0.60379803, + "learning_rate": 3.7422159870517025e-06, + "loss": 0.62217212, + "num_input_tokens_seen": 33301600, + "step": 1568, + "time_per_iteration": 3.1371662616729736 + }, + { + "auxiliary_loss_clip": 0.01185823, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.05699897, + "balance_loss_mlp": 1.02343059, + "epoch": 0.1886610954127337, + "flos": 21289318410240.0, + "grad_norm": 1.5667916484034317, + "language_loss": 0.7874403, + "learning_rate": 3.7418333085584717e-06, + "loss": 0.80962938, + "num_input_tokens_seen": 33322785, + "step": 1569, + "time_per_iteration": 2.583754539489746 + }, + { + "auxiliary_loss_clip": 0.01176572, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.05920696, + "balance_loss_mlp": 1.02469969, + "epoch": 0.18878133830337282, + "flos": 17266819991040.0, + "grad_norm": 2.183867635388492, + "language_loss": 0.90675151, + "learning_rate": 3.7414503658305128e-06, + "loss": 0.92886305, + "num_input_tokens_seen": 33340020, + "step": 1570, + "time_per_iteration": 2.550875186920166 + }, + { + "auxiliary_loss_clip": 0.01162559, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.05030751, + "balance_loss_mlp": 1.02482903, + "epoch": 0.1889015811940119, + "flos": 25775207210880.0, + "grad_norm": 2.513363555119363, + "language_loss": 0.77377975, + "learning_rate": 3.7410671589259185e-06, + "loss": 0.7957477, + "num_input_tokens_seen": 33358620, + "step": 1571, + "time_per_iteration": 2.6687686443328857 + }, + { + "auxiliary_loss_clip": 0.01221091, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.0634793, + "balance_loss_mlp": 1.02565145, + "epoch": 0.18902182408465099, + "flos": 21032197879680.0, + "grad_norm": 2.0974020198640164, + "language_loss": 0.79677409, + "learning_rate": 3.7406836879028205e-06, + "loss": 0.81934303, + "num_input_tokens_seen": 33378845, + "step": 1572, + "time_per_iteration": 3.241908073425293 + }, + { + "auxiliary_loss_clip": 0.01204745, + "auxiliary_loss_mlp": 0.01033645, + "balance_loss_clip": 1.06179833, + "balance_loss_mlp": 1.02382195, + "epoch": 0.1891420669752901, + "flos": 22272121411200.0, + "grad_norm": 2.877257024225958, + "language_loss": 0.76615435, + "learning_rate": 3.7402999528193907e-06, + "loss": 0.78853822, + "num_input_tokens_seen": 33398345, + "step": 1573, + "time_per_iteration": 2.522653818130493 + }, + { + "auxiliary_loss_clip": 0.01159945, + "auxiliary_loss_mlp": 0.00765082, + "balance_loss_clip": 1.05522072, + "balance_loss_mlp": 1.00125611, + "epoch": 0.18926230986592918, + "flos": 22017802141440.0, + "grad_norm": 2.5088768662122134, + "language_loss": 0.85467756, + "learning_rate": 3.739915953733842e-06, + "loss": 0.87392783, + "num_input_tokens_seen": 33416390, + "step": 1574, + "time_per_iteration": 4.194178819656372 + }, + { + "auxiliary_loss_clip": 0.01218979, + "auxiliary_loss_mlp": 0.01032959, + "balance_loss_clip": 1.06255531, + "balance_loss_mlp": 1.0234108, + "epoch": 0.18938255275656826, + "flos": 24462672336000.0, + "grad_norm": 1.5902777594996012, + "language_loss": 0.82046533, + "learning_rate": 3.7395316907044264e-06, + "loss": 0.84298474, + "num_input_tokens_seen": 33437175, + "step": 1575, + "time_per_iteration": 2.5510268211364746 + }, + { + "auxiliary_loss_clip": 0.01203771, + "auxiliary_loss_mlp": 0.01036116, + "balance_loss_clip": 1.05937088, + "balance_loss_mlp": 1.0265733, + "epoch": 0.18950279564720737, + "flos": 24427049022720.0, + "grad_norm": 1.5638724528012768, + "language_loss": 0.79413855, + "learning_rate": 3.7391471637894364e-06, + "loss": 0.81653738, + "num_input_tokens_seen": 33459440, + "step": 1576, + "time_per_iteration": 2.5726404190063477 + }, + { + "auxiliary_loss_clip": 0.01177521, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.05484235, + "balance_loss_mlp": 1.02683473, + "epoch": 0.18962303853784646, + "flos": 19756291898880.0, + "grad_norm": 1.8238264378690268, + "language_loss": 0.85006297, + "learning_rate": 3.738762373047205e-06, + "loss": 0.87220097, + "num_input_tokens_seen": 33479360, + "step": 1577, + "time_per_iteration": 2.580717086791992 + }, + { + "auxiliary_loss_clip": 0.01176726, + "auxiliary_loss_mlp": 0.01036154, + "balance_loss_clip": 1.05898416, + "balance_loss_mlp": 1.02662933, + "epoch": 0.18974328142848554, + "flos": 21032054225280.0, + "grad_norm": 1.6618809774010064, + "language_loss": 0.83225596, + "learning_rate": 3.738377318536103e-06, + "loss": 0.85438478, + "num_input_tokens_seen": 33499245, + "step": 1578, + "time_per_iteration": 2.584155559539795 + }, + { + "auxiliary_loss_clip": 0.01216222, + "auxiliary_loss_mlp": 0.01035089, + "balance_loss_clip": 1.06367838, + "balance_loss_mlp": 1.02635121, + "epoch": 0.18986352431912462, + "flos": 12966122736000.0, + "grad_norm": 1.9758848388073804, + "language_loss": 0.70866024, + "learning_rate": 3.7379920003145447e-06, + "loss": 0.73117328, + "num_input_tokens_seen": 33513520, + "step": 1579, + "time_per_iteration": 2.4178686141967773 + }, + { + "auxiliary_loss_clip": 0.01182925, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.05954337, + "balance_loss_mlp": 1.02701235, + "epoch": 0.18998376720976373, + "flos": 23767908497280.0, + "grad_norm": 2.258022130184419, + "language_loss": 0.837309, + "learning_rate": 3.7376064184409817e-06, + "loss": 0.85951215, + "num_input_tokens_seen": 33533100, + "step": 1580, + "time_per_iteration": 2.534109592437744 + }, + { + "auxiliary_loss_clip": 0.01188387, + "auxiliary_loss_mlp": 0.01033981, + "balance_loss_clip": 1.05996442, + "balance_loss_mlp": 1.02377629, + "epoch": 0.19010401010040281, + "flos": 22966023323520.0, + "grad_norm": 1.4164268897440808, + "language_loss": 0.87074792, + "learning_rate": 3.7372205729739063e-06, + "loss": 0.89297158, + "num_input_tokens_seen": 33554915, + "step": 1581, + "time_per_iteration": 2.535048723220825 + }, + { + "auxiliary_loss_clip": 0.01206932, + "auxiliary_loss_mlp": 0.01030462, + "balance_loss_clip": 1.06021559, + "balance_loss_mlp": 1.02000725, + "epoch": 0.1902242529910419, + "flos": 19135647774720.0, + "grad_norm": 2.822840903334306, + "language_loss": 0.72001266, + "learning_rate": 3.7368344639718514e-06, + "loss": 0.74238664, + "num_input_tokens_seen": 33572850, + "step": 1582, + "time_per_iteration": 2.4754326343536377 + }, + { + "auxiliary_loss_clip": 0.01205246, + "auxiliary_loss_mlp": 0.01041084, + "balance_loss_clip": 1.06047106, + "balance_loss_mlp": 1.03226805, + "epoch": 0.190344495881681, + "flos": 25483935824640.0, + "grad_norm": 1.6753643999543122, + "language_loss": 0.80642939, + "learning_rate": 3.7364480914933895e-06, + "loss": 0.82889271, + "num_input_tokens_seen": 33593090, + "step": 1583, + "time_per_iteration": 2.531172513961792 + }, + { + "auxiliary_loss_clip": 0.01156561, + "auxiliary_loss_mlp": 0.00765206, + "balance_loss_clip": 1.05529881, + "balance_loss_mlp": 1.00116158, + "epoch": 0.1904647387723201, + "flos": 26792843425920.0, + "grad_norm": 1.7764484770584368, + "language_loss": 0.80815411, + "learning_rate": 3.7360614555971325e-06, + "loss": 0.82737184, + "num_input_tokens_seen": 33612745, + "step": 1584, + "time_per_iteration": 2.633847236633301 + }, + { + "auxiliary_loss_clip": 0.01202965, + "auxiliary_loss_mlp": 0.00764723, + "balance_loss_clip": 1.06017041, + "balance_loss_mlp": 1.00108933, + "epoch": 0.19058498166295917, + "flos": 23987753688960.0, + "grad_norm": 1.9150613475059173, + "language_loss": 0.84991652, + "learning_rate": 3.735674556341733e-06, + "loss": 0.86959338, + "num_input_tokens_seen": 33632360, + "step": 1585, + "time_per_iteration": 2.5255250930786133 + }, + { + "auxiliary_loss_clip": 0.01188029, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.06202781, + "balance_loss_mlp": 1.02861571, + "epoch": 0.19070522455359826, + "flos": 28293299280000.0, + "grad_norm": 2.195488658542415, + "language_loss": 0.82918012, + "learning_rate": 3.7352873937858835e-06, + "loss": 0.85144138, + "num_input_tokens_seen": 33653895, + "step": 1586, + "time_per_iteration": 2.5863850116729736 + }, + { + "auxiliary_loss_clip": 0.01168593, + "auxiliary_loss_mlp": 0.00765267, + "balance_loss_clip": 1.05731654, + "balance_loss_mlp": 1.00098729, + "epoch": 0.19082546744423737, + "flos": 25660220797440.0, + "grad_norm": 1.8382884467899543, + "language_loss": 0.71705234, + "learning_rate": 3.734899967988316e-06, + "loss": 0.73639095, + "num_input_tokens_seen": 33672075, + "step": 1587, + "time_per_iteration": 2.5889408588409424 + }, + { + "auxiliary_loss_clip": 0.0116527, + "auxiliary_loss_mlp": 0.01031333, + "balance_loss_clip": 1.05372965, + "balance_loss_mlp": 1.02186811, + "epoch": 0.19094571033487645, + "flos": 19719483436800.0, + "grad_norm": 1.7617970397877933, + "language_loss": 0.84209192, + "learning_rate": 3.7345122790078026e-06, + "loss": 0.86405796, + "num_input_tokens_seen": 33689640, + "step": 1588, + "time_per_iteration": 2.548868417739868 + }, + { + "auxiliary_loss_clip": 0.0120359, + "auxiliary_loss_mlp": 0.01034373, + "balance_loss_clip": 1.06087148, + "balance_loss_mlp": 1.0238055, + "epoch": 0.19106595322551553, + "flos": 21616320850560.0, + "grad_norm": 2.9013520651165643, + "language_loss": 0.93263906, + "learning_rate": 3.7341243269031556e-06, + "loss": 0.95501864, + "num_input_tokens_seen": 33708630, + "step": 1589, + "time_per_iteration": 2.487123966217041 + }, + { + "auxiliary_loss_clip": 0.01179851, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.05791235, + "balance_loss_mlp": 1.02462196, + "epoch": 0.19118619611615464, + "flos": 29896890059520.0, + "grad_norm": 1.742265582404153, + "language_loss": 0.77464372, + "learning_rate": 3.7337361117332275e-06, + "loss": 0.79677957, + "num_input_tokens_seen": 33730370, + "step": 1590, + "time_per_iteration": 2.59507155418396 + }, + { + "auxiliary_loss_clip": 0.01172771, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.05422902, + "balance_loss_mlp": 1.02107489, + "epoch": 0.19130643900679373, + "flos": 17273428093440.0, + "grad_norm": 1.987391806197037, + "language_loss": 0.77362764, + "learning_rate": 3.7333476335569087e-06, + "loss": 0.79565418, + "num_input_tokens_seen": 33748370, + "step": 1591, + "time_per_iteration": 2.5393762588500977 + }, + { + "auxiliary_loss_clip": 0.01188525, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.05983901, + "balance_loss_mlp": 1.02528167, + "epoch": 0.1914266818974328, + "flos": 24826339584000.0, + "grad_norm": 2.221837379579082, + "language_loss": 0.67002207, + "learning_rate": 3.7329588924331325e-06, + "loss": 0.69226372, + "num_input_tokens_seen": 33769575, + "step": 1592, + "time_per_iteration": 2.57541561126709 + }, + { + "auxiliary_loss_clip": 0.01164584, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.05271208, + "balance_loss_mlp": 1.02580631, + "epoch": 0.1915469247880719, + "flos": 18952467390720.0, + "grad_norm": 1.6975999779184505, + "language_loss": 0.82252264, + "learning_rate": 3.732569888420871e-06, + "loss": 0.84452057, + "num_input_tokens_seen": 33789110, + "step": 1593, + "time_per_iteration": 3.3779399394989014 + }, + { + "auxiliary_loss_clip": 0.01219186, + "auxiliary_loss_mlp": 0.01034044, + "balance_loss_clip": 1.06030118, + "balance_loss_mlp": 1.02360141, + "epoch": 0.191667167678711, + "flos": 21032952065280.0, + "grad_norm": 3.3382731143799758, + "language_loss": 0.82523894, + "learning_rate": 3.732180621579134e-06, + "loss": 0.84777123, + "num_input_tokens_seen": 33808325, + "step": 1594, + "time_per_iteration": 2.4570164680480957 + }, + { + "auxiliary_loss_clip": 0.01184581, + "auxiliary_loss_mlp": 0.01035344, + "balance_loss_clip": 1.05961871, + "balance_loss_mlp": 1.02532482, + "epoch": 0.1917874105693501, + "flos": 34237663914240.0, + "grad_norm": 4.589277036258129, + "language_loss": 0.81210053, + "learning_rate": 3.7317910919669745e-06, + "loss": 0.8342998, + "num_input_tokens_seen": 33829520, + "step": 1595, + "time_per_iteration": 2.6728694438934326 + }, + { + "auxiliary_loss_clip": 0.01202828, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.06096947, + "balance_loss_mlp": 1.03000343, + "epoch": 0.19190765345998917, + "flos": 23550613171200.0, + "grad_norm": 2.118972675721372, + "language_loss": 0.76637793, + "learning_rate": 3.7314012996434826e-06, + "loss": 0.78880751, + "num_input_tokens_seen": 33848250, + "step": 1596, + "time_per_iteration": 2.498216390609741 + }, + { + "auxiliary_loss_clip": 0.01190154, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.05953336, + "balance_loss_mlp": 1.02100563, + "epoch": 0.19202789635062828, + "flos": 19861330245120.0, + "grad_norm": 1.9762185883592378, + "language_loss": 0.81549913, + "learning_rate": 3.7310112446677907e-06, + "loss": 0.83770967, + "num_input_tokens_seen": 33866160, + "step": 1597, + "time_per_iteration": 2.504812479019165 + }, + { + "auxiliary_loss_clip": 0.01222459, + "auxiliary_loss_mlp": 0.01030574, + "balance_loss_clip": 1.06431508, + "balance_loss_mlp": 1.020805, + "epoch": 0.19214813924126736, + "flos": 20922957642240.0, + "grad_norm": 3.6050880450292677, + "language_loss": 0.69361144, + "learning_rate": 3.7306209270990695e-06, + "loss": 0.71614176, + "num_input_tokens_seen": 33884165, + "step": 1598, + "time_per_iteration": 2.488412857055664 + }, + { + "auxiliary_loss_clip": 0.01188485, + "auxiliary_loss_mlp": 0.01040707, + "balance_loss_clip": 1.05908775, + "balance_loss_mlp": 1.03145051, + "epoch": 0.19226838213190645, + "flos": 26359725231360.0, + "grad_norm": 1.9539046762315475, + "language_loss": 0.86761081, + "learning_rate": 3.7302303469965292e-06, + "loss": 0.88990283, + "num_input_tokens_seen": 33903705, + "step": 1599, + "time_per_iteration": 3.3601696491241455 + }, + { + "auxiliary_loss_clip": 0.0120324, + "auxiliary_loss_mlp": 0.01042113, + "balance_loss_clip": 1.06108558, + "balance_loss_mlp": 1.0325942, + "epoch": 0.19238862502254553, + "flos": 20850525866880.0, + "grad_norm": 5.035027889308814, + "language_loss": 0.709975, + "learning_rate": 3.7298395044194206e-06, + "loss": 0.73242855, + "num_input_tokens_seen": 33922515, + "step": 1600, + "time_per_iteration": 3.328348159790039 + }, + { + "auxiliary_loss_clip": 0.01222932, + "auxiliary_loss_mlp": 0.01034773, + "balance_loss_clip": 1.0658021, + "balance_loss_mlp": 1.0252068, + "epoch": 0.19250886791318464, + "flos": 21726063878400.0, + "grad_norm": 2.5307715227204817, + "language_loss": 0.94272757, + "learning_rate": 3.7294483994270356e-06, + "loss": 0.96530461, + "num_input_tokens_seen": 33940840, + "step": 1601, + "time_per_iteration": 3.2153122425079346 + }, + { + "auxiliary_loss_clip": 0.01146556, + "auxiliary_loss_mlp": 0.0103168, + "balance_loss_clip": 1.05223823, + "balance_loss_mlp": 1.02334726, + "epoch": 0.19262911080382372, + "flos": 23367827836800.0, + "grad_norm": 2.1602791193047346, + "language_loss": 0.78748274, + "learning_rate": 3.7290570320787033e-06, + "loss": 0.80926508, + "num_input_tokens_seen": 33960420, + "step": 1602, + "time_per_iteration": 2.5850577354431152 + }, + { + "auxiliary_loss_clip": 0.01203137, + "auxiliary_loss_mlp": 0.01031909, + "balance_loss_clip": 1.06198585, + "balance_loss_mlp": 1.02238417, + "epoch": 0.1927493536944628, + "flos": 21943502858880.0, + "grad_norm": 2.260678450619943, + "language_loss": 0.71356058, + "learning_rate": 3.728665402433793e-06, + "loss": 0.73591101, + "num_input_tokens_seen": 33978990, + "step": 1603, + "time_per_iteration": 2.5086870193481445 + }, + { + "auxiliary_loss_clip": 0.01192163, + "auxiliary_loss_mlp": 0.01034638, + "balance_loss_clip": 1.06267428, + "balance_loss_mlp": 1.02552414, + "epoch": 0.19286959658510192, + "flos": 16545590807040.0, + "grad_norm": 2.3352052621092008, + "language_loss": 0.86060452, + "learning_rate": 3.7282735105517164e-06, + "loss": 0.88287258, + "num_input_tokens_seen": 33997115, + "step": 1604, + "time_per_iteration": 2.497164249420166 + }, + { + "auxiliary_loss_clip": 0.01165361, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.05348873, + "balance_loss_mlp": 1.02702296, + "epoch": 0.192989839475741, + "flos": 21616967295360.0, + "grad_norm": 1.9861077245212593, + "language_loss": 0.67265725, + "learning_rate": 3.727881356491922e-06, + "loss": 0.69467771, + "num_input_tokens_seen": 34015525, + "step": 1605, + "time_per_iteration": 2.606354236602783 + }, + { + "auxiliary_loss_clip": 0.01219973, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.06457198, + "balance_loss_mlp": 1.02789354, + "epoch": 0.19311008236638008, + "flos": 19281516906240.0, + "grad_norm": 1.8079207738179592, + "language_loss": 0.75772434, + "learning_rate": 3.7274889403139002e-06, + "loss": 0.78028947, + "num_input_tokens_seen": 34033150, + "step": 1606, + "time_per_iteration": 2.4447648525238037 + }, + { + "auxiliary_loss_clip": 0.01157561, + "auxiliary_loss_mlp": 0.01032947, + "balance_loss_clip": 1.05782604, + "balance_loss_mlp": 1.0236131, + "epoch": 0.1932303252570192, + "flos": 28652369587200.0, + "grad_norm": 2.4854466326576645, + "language_loss": 0.78287339, + "learning_rate": 3.727096262077179e-06, + "loss": 0.80477846, + "num_input_tokens_seen": 34052145, + "step": 1607, + "time_per_iteration": 2.6555755138397217 + }, + { + "auxiliary_loss_clip": 0.01205283, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.06171179, + "balance_loss_mlp": 1.02213192, + "epoch": 0.19335056814765827, + "flos": 18368990864640.0, + "grad_norm": 1.712899263160362, + "language_loss": 0.85260642, + "learning_rate": 3.7267033218413285e-06, + "loss": 0.87497365, + "num_input_tokens_seen": 34069940, + "step": 1608, + "time_per_iteration": 2.485337734222412 + }, + { + "auxiliary_loss_clip": 0.01144435, + "auxiliary_loss_mlp": 0.01037978, + "balance_loss_clip": 1.04949069, + "balance_loss_mlp": 1.02715993, + "epoch": 0.19347081103829736, + "flos": 13260877741440.0, + "grad_norm": 2.3274551957348333, + "language_loss": 0.81559098, + "learning_rate": 3.726310119665957e-06, + "loss": 0.83741516, + "num_input_tokens_seen": 34086275, + "step": 1609, + "time_per_iteration": 2.5958425998687744 + }, + { + "auxiliary_loss_clip": 0.01203492, + "auxiliary_loss_mlp": 0.01032228, + "balance_loss_clip": 1.05945301, + "balance_loss_mlp": 1.02264357, + "epoch": 0.19359105392893644, + "flos": 20300122788480.0, + "grad_norm": 6.797871910514251, + "language_loss": 0.8529979, + "learning_rate": 3.725916655610713e-06, + "loss": 0.87535512, + "num_input_tokens_seen": 34105605, + "step": 1610, + "time_per_iteration": 2.50962233543396 + }, + { + "auxiliary_loss_clip": 0.01179336, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.05387163, + "balance_loss_mlp": 1.02072346, + "epoch": 0.19371129681957555, + "flos": 20484596062080.0, + "grad_norm": 2.271893400566545, + "language_loss": 0.756423, + "learning_rate": 3.725522929735284e-06, + "loss": 0.77852488, + "num_input_tokens_seen": 34122540, + "step": 1611, + "time_per_iteration": 2.4995856285095215 + }, + { + "auxiliary_loss_clip": 0.01195033, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.05754519, + "balance_loss_mlp": 1.02136147, + "epoch": 0.19383153971021463, + "flos": 30445497457920.0, + "grad_norm": 3.9646823815996592, + "language_loss": 0.74336874, + "learning_rate": 3.725128942099399e-06, + "loss": 0.76563156, + "num_input_tokens_seen": 34142940, + "step": 1612, + "time_per_iteration": 2.5952932834625244 + }, + { + "auxiliary_loss_clip": 0.01176847, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.0547688, + "balance_loss_mlp": 1.02373433, + "epoch": 0.19395178260085372, + "flos": 24569937325440.0, + "grad_norm": 3.5108690578685433, + "language_loss": 0.79512978, + "learning_rate": 3.7247346927628245e-06, + "loss": 0.81723213, + "num_input_tokens_seen": 34162875, + "step": 1613, + "time_per_iteration": 2.5632119178771973 + }, + { + "auxiliary_loss_clip": 0.01183374, + "auxiliary_loss_mlp": 0.00765144, + "balance_loss_clip": 1.05575013, + "balance_loss_mlp": 1.00093484, + "epoch": 0.19407202549149283, + "flos": 28950608211840.0, + "grad_norm": 1.8605008041969282, + "language_loss": 0.79384148, + "learning_rate": 3.7243401817853694e-06, + "loss": 0.81332666, + "num_input_tokens_seen": 34183565, + "step": 1614, + "time_per_iteration": 2.5685200691223145 + }, + { + "auxiliary_loss_clip": 0.01195809, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.05703545, + "balance_loss_mlp": 1.02399826, + "epoch": 0.1941922683821319, + "flos": 18004497603840.0, + "grad_norm": 1.9572675795824275, + "language_loss": 0.71726632, + "learning_rate": 3.723945409226879e-06, + "loss": 0.73955715, + "num_input_tokens_seen": 34202055, + "step": 1615, + "time_per_iteration": 2.445333242416382 + }, + { + "auxiliary_loss_clip": 0.01202986, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.05983257, + "balance_loss_mlp": 1.03220236, + "epoch": 0.194312511272771, + "flos": 9720337034880.0, + "grad_norm": 2.569418678549781, + "language_loss": 0.80127764, + "learning_rate": 3.723550375147241e-06, + "loss": 0.82372916, + "num_input_tokens_seen": 34216830, + "step": 1616, + "time_per_iteration": 2.473280668258667 + }, + { + "auxiliary_loss_clip": 0.01159501, + "auxiliary_loss_mlp": 0.0103576, + "balance_loss_clip": 1.05108607, + "balance_loss_mlp": 1.02555561, + "epoch": 0.19443275416341008, + "flos": 27016208150400.0, + "grad_norm": 1.6555038259686654, + "language_loss": 0.79878938, + "learning_rate": 3.7231550796063816e-06, + "loss": 0.82074201, + "num_input_tokens_seen": 34236840, + "step": 1617, + "time_per_iteration": 2.594310760498047 + }, + { + "auxiliary_loss_clip": 0.01195926, + "auxiliary_loss_mlp": 0.01038931, + "balance_loss_clip": 1.06091058, + "balance_loss_mlp": 1.02860737, + "epoch": 0.1945529970540492, + "flos": 15846625077120.0, + "grad_norm": 2.2155233497930267, + "language_loss": 0.64881492, + "learning_rate": 3.722759522664266e-06, + "loss": 0.6711635, + "num_input_tokens_seen": 34254140, + "step": 1618, + "time_per_iteration": 2.5026543140411377 + }, + { + "auxiliary_loss_clip": 0.01159288, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.05464172, + "balance_loss_mlp": 1.01980817, + "epoch": 0.19467323994468827, + "flos": 19314985403520.0, + "grad_norm": 2.094113840125367, + "language_loss": 0.81364161, + "learning_rate": 3.7223637043809016e-06, + "loss": 0.83553147, + "num_input_tokens_seen": 34273120, + "step": 1619, + "time_per_iteration": 3.3568055629730225 + }, + { + "auxiliary_loss_clip": 0.01177042, + "auxiliary_loss_mlp": 0.01039793, + "balance_loss_clip": 1.05788827, + "balance_loss_mlp": 1.03059053, + "epoch": 0.19479348283532735, + "flos": 24133227770880.0, + "grad_norm": 1.7939700519857602, + "language_loss": 0.86837673, + "learning_rate": 3.7219676248163322e-06, + "loss": 0.89054513, + "num_input_tokens_seen": 34290285, + "step": 1620, + "time_per_iteration": 2.550503730773926 + }, + { + "auxiliary_loss_clip": 0.01209746, + "auxiliary_loss_mlp": 0.01033836, + "balance_loss_clip": 1.06290007, + "balance_loss_mlp": 1.02362609, + "epoch": 0.19491372572596646, + "flos": 25775638174080.0, + "grad_norm": 1.9266301918596251, + "language_loss": 0.93155599, + "learning_rate": 3.721571284030643e-06, + "loss": 0.95399183, + "num_input_tokens_seen": 34310095, + "step": 1621, + "time_per_iteration": 2.527374744415283 + }, + { + "auxiliary_loss_clip": 0.01207716, + "auxiliary_loss_mlp": 0.01026182, + "balance_loss_clip": 1.06077635, + "balance_loss_mlp": 1.01655555, + "epoch": 0.19503396861660555, + "flos": 19645220067840.0, + "grad_norm": 2.611281306459111, + "language_loss": 0.79294729, + "learning_rate": 3.7211746820839587e-06, + "loss": 0.81528628, + "num_input_tokens_seen": 34327190, + "step": 1622, + "time_per_iteration": 2.482006788253784 + }, + { + "auxiliary_loss_clip": 0.01109937, + "auxiliary_loss_mlp": 0.01030952, + "balance_loss_clip": 1.04737592, + "balance_loss_mlp": 1.02103972, + "epoch": 0.19515421150724463, + "flos": 21033023892480.0, + "grad_norm": 1.6229701700855272, + "language_loss": 0.80526686, + "learning_rate": 3.7207778190364437e-06, + "loss": 0.82667577, + "num_input_tokens_seen": 34345615, + "step": 1623, + "time_per_iteration": 2.632573366165161 + }, + { + "auxiliary_loss_clip": 0.01130734, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.04932904, + "balance_loss_mlp": 1.02382112, + "epoch": 0.1952744543978837, + "flos": 32961255143040.0, + "grad_norm": 1.6046977402444162, + "language_loss": 0.73752093, + "learning_rate": 3.720380694948302e-06, + "loss": 0.7591635, + "num_input_tokens_seen": 34368500, + "step": 1624, + "time_per_iteration": 2.6935505867004395 + }, + { + "auxiliary_loss_clip": 0.01083639, + "auxiliary_loss_mlp": 0.01007573, + "balance_loss_clip": 1.03001809, + "balance_loss_mlp": 1.00430667, + "epoch": 0.19539469728852282, + "flos": 64044312030720.0, + "grad_norm": 1.0340483389483592, + "language_loss": 0.71240377, + "learning_rate": 3.719983309879777e-06, + "loss": 0.73331594, + "num_input_tokens_seen": 34428280, + "step": 1625, + "time_per_iteration": 3.9505069255828857 + }, + { + "auxiliary_loss_clip": 0.01164246, + "auxiliary_loss_mlp": 0.0104103, + "balance_loss_clip": 1.05362272, + "balance_loss_mlp": 1.03160608, + "epoch": 0.1955149401791619, + "flos": 13370908078080.0, + "grad_norm": 1.9448949063990069, + "language_loss": 0.7744208, + "learning_rate": 3.719585663891151e-06, + "loss": 0.7964735, + "num_input_tokens_seen": 34445815, + "step": 1626, + "time_per_iteration": 3.36405873298645 + }, + { + "auxiliary_loss_clip": 0.0115292, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_clip": 1.05643702, + "balance_loss_mlp": 1.03162766, + "epoch": 0.195635183069801, + "flos": 18728887184640.0, + "grad_norm": 2.104964249852706, + "language_loss": 0.78847951, + "learning_rate": 3.719187757042747e-06, + "loss": 0.81042689, + "num_input_tokens_seen": 34463635, + "step": 1627, + "time_per_iteration": 3.3201677799224854 + }, + { + "auxiliary_loss_clip": 0.01104896, + "auxiliary_loss_mlp": 0.01003333, + "balance_loss_clip": 1.0356195, + "balance_loss_mlp": 1.00038815, + "epoch": 0.1957554259604401, + "flos": 69313952615040.0, + "grad_norm": 0.7295797716334137, + "language_loss": 0.54937702, + "learning_rate": 3.7187895893949275e-06, + "loss": 0.57045931, + "num_input_tokens_seen": 34530105, + "step": 1628, + "time_per_iteration": 3.1895103454589844 + }, + { + "auxiliary_loss_clip": 0.01145832, + "auxiliary_loss_mlp": 0.01025827, + "balance_loss_clip": 1.05101264, + "balance_loss_mlp": 1.01543212, + "epoch": 0.19587566885107918, + "flos": 21069257736960.0, + "grad_norm": 2.2061824573171993, + "language_loss": 0.76364571, + "learning_rate": 3.7183911610080937e-06, + "loss": 0.78536224, + "num_input_tokens_seen": 34546970, + "step": 1629, + "time_per_iteration": 2.5896711349487305 + }, + { + "auxiliary_loss_clip": 0.01178498, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.05810952, + "balance_loss_mlp": 1.03204989, + "epoch": 0.19599591174171827, + "flos": 22194661731840.0, + "grad_norm": 2.9276473469108653, + "language_loss": 0.74557209, + "learning_rate": 3.7179924719426872e-06, + "loss": 0.76778662, + "num_input_tokens_seen": 34564865, + "step": 1630, + "time_per_iteration": 2.5846400260925293 + }, + { + "auxiliary_loss_clip": 0.01208447, + "auxiliary_loss_mlp": 0.01040735, + "balance_loss_clip": 1.06222546, + "balance_loss_mlp": 1.03079903, + "epoch": 0.19611615463235738, + "flos": 23768375374080.0, + "grad_norm": 2.511109352186563, + "language_loss": 0.76104778, + "learning_rate": 3.7175935222591885e-06, + "loss": 0.78353965, + "num_input_tokens_seen": 34584165, + "step": 1631, + "time_per_iteration": 2.536773204803467 + }, + { + "auxiliary_loss_clip": 0.0119398, + "auxiliary_loss_mlp": 0.01034552, + "balance_loss_clip": 1.06454563, + "balance_loss_mlp": 1.02465725, + "epoch": 0.19623639752299646, + "flos": 28618218731520.0, + "grad_norm": 1.8385144209183029, + "language_loss": 0.74802679, + "learning_rate": 3.717194312018118e-06, + "loss": 0.77031207, + "num_input_tokens_seen": 34603150, + "step": 1632, + "time_per_iteration": 2.6672165393829346 + }, + { + "auxiliary_loss_clip": 0.01203381, + "auxiliary_loss_mlp": 0.01037981, + "balance_loss_clip": 1.0590229, + "balance_loss_mlp": 1.02780676, + "epoch": 0.19635664041363554, + "flos": 21032700670080.0, + "grad_norm": 2.1660042650376092, + "language_loss": 0.76431143, + "learning_rate": 3.716794841280036e-06, + "loss": 0.78672504, + "num_input_tokens_seen": 34621855, + "step": 1633, + "time_per_iteration": 2.50923752784729 + }, + { + "auxiliary_loss_clip": 0.0121039, + "auxiliary_loss_mlp": 0.01038689, + "balance_loss_clip": 1.06045151, + "balance_loss_mlp": 1.02867508, + "epoch": 0.19647688330427462, + "flos": 18879748306560.0, + "grad_norm": 2.0475445430633057, + "language_loss": 0.77557826, + "learning_rate": 3.7163951101055407e-06, + "loss": 0.798069, + "num_input_tokens_seen": 34639915, + "step": 1634, + "time_per_iteration": 2.4987168312072754 + }, + { + "auxiliary_loss_clip": 0.01188349, + "auxiliary_loss_mlp": 0.01036922, + "balance_loss_clip": 1.06042349, + "balance_loss_mlp": 1.02673519, + "epoch": 0.19659712619491373, + "flos": 24242503921920.0, + "grad_norm": 1.9019399445160583, + "language_loss": 0.78896469, + "learning_rate": 3.715995118555273e-06, + "loss": 0.81121737, + "num_input_tokens_seen": 34659890, + "step": 1635, + "time_per_iteration": 2.5771420001983643 + }, + { + "auxiliary_loss_clip": 0.01154421, + "auxiliary_loss_mlp": 0.01039155, + "balance_loss_clip": 1.05317211, + "balance_loss_mlp": 1.02812254, + "epoch": 0.19671736908555282, + "flos": 24717422568960.0, + "grad_norm": 2.4529292534742893, + "language_loss": 0.85550624, + "learning_rate": 3.71559486668991e-06, + "loss": 0.877442, + "num_input_tokens_seen": 34678750, + "step": 1636, + "time_per_iteration": 2.63238263130188 + }, + { + "auxiliary_loss_clip": 0.01211982, + "auxiliary_loss_mlp": 0.00765143, + "balance_loss_clip": 1.06431162, + "balance_loss_mlp": 1.00102079, + "epoch": 0.1968376119761919, + "flos": 23842279607040.0, + "grad_norm": 1.6884853463991911, + "language_loss": 0.7729736, + "learning_rate": 3.715194354570169e-06, + "loss": 0.79274487, + "num_input_tokens_seen": 34698755, + "step": 1637, + "time_per_iteration": 2.5613176822662354 + }, + { + "auxiliary_loss_clip": 0.01206988, + "auxiliary_loss_mlp": 0.01042607, + "balance_loss_clip": 1.06504273, + "balance_loss_mlp": 1.03295088, + "epoch": 0.196957854866831, + "flos": 18113917409280.0, + "grad_norm": 1.9691690305393585, + "language_loss": 0.83363259, + "learning_rate": 3.714793582256809e-06, + "loss": 0.85612857, + "num_input_tokens_seen": 34715820, + "step": 1638, + "time_per_iteration": 2.583327054977417 + }, + { + "auxiliary_loss_clip": 0.01218352, + "auxiliary_loss_mlp": 0.01036213, + "balance_loss_clip": 1.06208348, + "balance_loss_mlp": 1.02621758, + "epoch": 0.1970780977574701, + "flos": 21653129312640.0, + "grad_norm": 2.6975099761931673, + "language_loss": 0.85075057, + "learning_rate": 3.7143925498106253e-06, + "loss": 0.87329626, + "num_input_tokens_seen": 34734360, + "step": 1639, + "time_per_iteration": 2.4852707386016846 + }, + { + "auxiliary_loss_clip": 0.01189501, + "auxiliary_loss_mlp": 0.01035805, + "balance_loss_clip": 1.05576229, + "balance_loss_mlp": 1.02508259, + "epoch": 0.19719834064810918, + "flos": 20811813984000.0, + "grad_norm": 1.9507284402425875, + "language_loss": 0.79450566, + "learning_rate": 3.7139912572924558e-06, + "loss": 0.81675875, + "num_input_tokens_seen": 34753390, + "step": 1640, + "time_per_iteration": 2.5805928707122803 + }, + { + "auxiliary_loss_clip": 0.01201007, + "auxiliary_loss_mlp": 0.01038118, + "balance_loss_clip": 1.05786896, + "balance_loss_mlp": 1.02830696, + "epoch": 0.19731858353874826, + "flos": 23434800744960.0, + "grad_norm": 4.183664415689264, + "language_loss": 0.79915917, + "learning_rate": 3.7135897047631744e-06, + "loss": 0.82155037, + "num_input_tokens_seen": 34771275, + "step": 1641, + "time_per_iteration": 2.5684657096862793 + }, + { + "auxiliary_loss_clip": 0.01192026, + "auxiliary_loss_mlp": 0.01036715, + "balance_loss_clip": 1.06010377, + "balance_loss_mlp": 1.02636778, + "epoch": 0.19743882642938737, + "flos": 23988184652160.0, + "grad_norm": 2.4634363500286907, + "language_loss": 0.76119924, + "learning_rate": 3.713187892283698e-06, + "loss": 0.7834866, + "num_input_tokens_seen": 34790885, + "step": 1642, + "time_per_iteration": 2.5714285373687744 + }, + { + "auxiliary_loss_clip": 0.01158354, + "auxiliary_loss_mlp": 0.01039186, + "balance_loss_clip": 1.05249333, + "balance_loss_mlp": 1.02901709, + "epoch": 0.19755906932002645, + "flos": 15004340081280.0, + "grad_norm": 2.459635447959844, + "language_loss": 0.87307298, + "learning_rate": 3.71278581991498e-06, + "loss": 0.89504838, + "num_input_tokens_seen": 34806745, + "step": 1643, + "time_per_iteration": 2.5991883277893066 + }, + { + "auxiliary_loss_clip": 0.0118069, + "auxiliary_loss_mlp": 0.00766174, + "balance_loss_clip": 1.06415129, + "balance_loss_mlp": 1.00091124, + "epoch": 0.19767931221066554, + "flos": 19494466686720.0, + "grad_norm": 1.7661277831995152, + "language_loss": 0.79050279, + "learning_rate": 3.712383487718015e-06, + "loss": 0.80997145, + "num_input_tokens_seen": 34824985, + "step": 1644, + "time_per_iteration": 2.565838098526001 + }, + { + "auxiliary_loss_clip": 0.01141436, + "auxiliary_loss_mlp": 0.01034648, + "balance_loss_clip": 1.05368018, + "balance_loss_mlp": 1.02492642, + "epoch": 0.19779955510130465, + "flos": 25737895958400.0, + "grad_norm": 1.9384558475600515, + "language_loss": 0.86899418, + "learning_rate": 3.7119808957538365e-06, + "loss": 0.89075506, + "num_input_tokens_seen": 34843980, + "step": 1645, + "time_per_iteration": 2.6641485691070557 + }, + { + "auxiliary_loss_clip": 0.01185913, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.05651164, + "balance_loss_mlp": 1.02508879, + "epoch": 0.19791979799194373, + "flos": 20777699041920.0, + "grad_norm": 8.451098249241808, + "language_loss": 0.80159289, + "learning_rate": 3.711578044083517e-06, + "loss": 0.8238098, + "num_input_tokens_seen": 34860780, + "step": 1646, + "time_per_iteration": 3.3448476791381836 + }, + { + "auxiliary_loss_clip": 0.01192336, + "auxiliary_loss_mlp": 0.01041315, + "balance_loss_clip": 1.06008196, + "balance_loss_mlp": 1.03087234, + "epoch": 0.1980400408825828, + "flos": 25589010084480.0, + "grad_norm": 1.9040656801999716, + "language_loss": 0.74541759, + "learning_rate": 3.7111749327681698e-06, + "loss": 0.76775408, + "num_input_tokens_seen": 34880815, + "step": 1647, + "time_per_iteration": 2.5679314136505127 + }, + { + "auxiliary_loss_clip": 0.0121259, + "auxiliary_loss_mlp": 0.01030518, + "balance_loss_clip": 1.06656599, + "balance_loss_mlp": 1.02118969, + "epoch": 0.1981602837732219, + "flos": 23513840622720.0, + "grad_norm": 3.0242349250946368, + "language_loss": 0.86099494, + "learning_rate": 3.7107715618689455e-06, + "loss": 0.88342601, + "num_input_tokens_seen": 34899790, + "step": 1648, + "time_per_iteration": 2.5200653076171875 + }, + { + "auxiliary_loss_clip": 0.01202204, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.0612092, + "balance_loss_mlp": 1.02311718, + "epoch": 0.198280526663861, + "flos": 23185365724800.0, + "grad_norm": 1.5420652078259156, + "language_loss": 0.83605528, + "learning_rate": 3.710367931447035e-06, + "loss": 0.85841078, + "num_input_tokens_seen": 34921570, + "step": 1649, + "time_per_iteration": 2.5283823013305664 + }, + { + "auxiliary_loss_clip": 0.01214658, + "auxiliary_loss_mlp": 0.01041979, + "balance_loss_clip": 1.0640049, + "balance_loss_mlp": 1.03121471, + "epoch": 0.1984007695545001, + "flos": 21689470897920.0, + "grad_norm": 2.1688839824402373, + "language_loss": 0.86143672, + "learning_rate": 3.70996404156367e-06, + "loss": 0.88400304, + "num_input_tokens_seen": 34941205, + "step": 1650, + "time_per_iteration": 2.5186758041381836 + }, + { + "auxiliary_loss_clip": 0.01152221, + "auxiliary_loss_mlp": 0.01038833, + "balance_loss_clip": 1.05473626, + "balance_loss_mlp": 1.02929068, + "epoch": 0.19852101244513917, + "flos": 36064008887040.0, + "grad_norm": 1.6877518690527404, + "language_loss": 0.7288053, + "learning_rate": 3.7095598922801187e-06, + "loss": 0.75071585, + "num_input_tokens_seen": 34963280, + "step": 1651, + "time_per_iteration": 2.7213032245635986 + }, + { + "auxiliary_loss_clip": 0.01221892, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.06495404, + "balance_loss_mlp": 1.02660072, + "epoch": 0.19864125533577828, + "flos": 23105894883840.0, + "grad_norm": 2.998001063498881, + "language_loss": 0.7636205, + "learning_rate": 3.7091554836576914e-06, + "loss": 0.78620857, + "num_input_tokens_seen": 34979955, + "step": 1652, + "time_per_iteration": 3.3028557300567627 + }, + { + "auxiliary_loss_clip": 0.01205783, + "auxiliary_loss_mlp": 0.00765017, + "balance_loss_clip": 1.0644052, + "balance_loss_mlp": 1.00104976, + "epoch": 0.19876149822641737, + "flos": 24608505553920.0, + "grad_norm": 1.7078364518663895, + "language_loss": 0.82894939, + "learning_rate": 3.708750815757736e-06, + "loss": 0.84865743, + "num_input_tokens_seen": 35000725, + "step": 1653, + "time_per_iteration": 4.1630539894104 + }, + { + "auxiliary_loss_clip": 0.01208338, + "auxiliary_loss_mlp": 0.01040195, + "balance_loss_clip": 1.06394792, + "balance_loss_mlp": 1.03007448, + "epoch": 0.19888174111705645, + "flos": 32196645308160.0, + "grad_norm": 2.4355750053133414, + "language_loss": 0.73480725, + "learning_rate": 3.7083458886416407e-06, + "loss": 0.75729263, + "num_input_tokens_seen": 35019920, + "step": 1654, + "time_per_iteration": 2.5967397689819336 + }, + { + "auxiliary_loss_clip": 0.01152681, + "auxiliary_loss_mlp": 0.01034596, + "balance_loss_clip": 1.05905902, + "balance_loss_mlp": 1.02455306, + "epoch": 0.19900198400769553, + "flos": 24608469640320.0, + "grad_norm": 2.2523856304074594, + "language_loss": 0.87917298, + "learning_rate": 3.707940702370832e-06, + "loss": 0.90104574, + "num_input_tokens_seen": 35040765, + "step": 1655, + "time_per_iteration": 2.7132151126861572 + }, + { + "auxiliary_loss_clip": 0.01111372, + "auxiliary_loss_mlp": 0.01003066, + "balance_loss_clip": 1.03698659, + "balance_loss_mlp": 1.00069416, + "epoch": 0.19912222689833464, + "flos": 67915805673600.0, + "grad_norm": 0.7581153224962618, + "language_loss": 0.58303392, + "learning_rate": 3.707535257006777e-06, + "loss": 0.60417831, + "num_input_tokens_seen": 35106390, + "step": 1656, + "time_per_iteration": 3.175356149673462 + }, + { + "auxiliary_loss_clip": 0.01193514, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.06086111, + "balance_loss_mlp": 1.02853, + "epoch": 0.19924246978897373, + "flos": 15742340916480.0, + "grad_norm": 2.4797533285055735, + "language_loss": 0.88435018, + "learning_rate": 3.707129552610981e-06, + "loss": 0.90667754, + "num_input_tokens_seen": 35125040, + "step": 1657, + "time_per_iteration": 2.524670362472534 + }, + { + "auxiliary_loss_clip": 0.01187576, + "auxiliary_loss_mlp": 0.01031936, + "balance_loss_clip": 1.06300759, + "balance_loss_mlp": 1.02218461, + "epoch": 0.1993627126796128, + "flos": 17566566986880.0, + "grad_norm": 2.0764022080352387, + "language_loss": 0.73549104, + "learning_rate": 3.70672358924499e-06, + "loss": 0.75768614, + "num_input_tokens_seen": 35144280, + "step": 1658, + "time_per_iteration": 2.53963303565979 + }, + { + "auxiliary_loss_clip": 0.01176565, + "auxiliary_loss_mlp": 0.01037206, + "balance_loss_clip": 1.06365061, + "balance_loss_mlp": 1.0269897, + "epoch": 0.19948295557025192, + "flos": 40843826680320.0, + "grad_norm": 2.197653805299495, + "language_loss": 0.78508693, + "learning_rate": 3.706317366970386e-06, + "loss": 0.80722463, + "num_input_tokens_seen": 35165280, + "step": 1659, + "time_per_iteration": 2.7581794261932373 + }, + { + "auxiliary_loss_clip": 0.01223309, + "auxiliary_loss_mlp": 0.00765974, + "balance_loss_clip": 1.06246698, + "balance_loss_mlp": 1.00098264, + "epoch": 0.199603198460891, + "flos": 25082418620160.0, + "grad_norm": 6.106228511336119, + "language_loss": 0.83624744, + "learning_rate": 3.705910885848795e-06, + "loss": 0.85614032, + "num_input_tokens_seen": 35183655, + "step": 1660, + "time_per_iteration": 2.5417072772979736 + }, + { + "auxiliary_loss_clip": 0.01207101, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.06376636, + "balance_loss_mlp": 1.02113295, + "epoch": 0.19972344135153008, + "flos": 20084120352000.0, + "grad_norm": 2.050175655351116, + "language_loss": 0.8430661, + "learning_rate": 3.705504145941879e-06, + "loss": 0.86544752, + "num_input_tokens_seen": 35201825, + "step": 1661, + "time_per_iteration": 2.493687868118286 + }, + { + "auxiliary_loss_clip": 0.01220762, + "auxiliary_loss_mlp": 0.01031462, + "balance_loss_clip": 1.06453264, + "balance_loss_mlp": 1.02156162, + "epoch": 0.1998436842421692, + "flos": 23727472761600.0, + "grad_norm": 1.8797041062696531, + "language_loss": 0.78811651, + "learning_rate": 3.7050971473113403e-06, + "loss": 0.81063873, + "num_input_tokens_seen": 35221600, + "step": 1662, + "time_per_iteration": 2.49241304397583 + }, + { + "auxiliary_loss_clip": 0.01201331, + "auxiliary_loss_mlp": 0.00764981, + "balance_loss_clip": 1.06013894, + "balance_loss_mlp": 1.00094926, + "epoch": 0.19996392713280828, + "flos": 36102361633920.0, + "grad_norm": 1.6599697666414834, + "language_loss": 0.80064654, + "learning_rate": 3.7046898900189196e-06, + "loss": 0.82030964, + "num_input_tokens_seen": 35245935, + "step": 1663, + "time_per_iteration": 2.6219301223754883 + }, + { + "auxiliary_loss_clip": 0.01181284, + "auxiliary_loss_mlp": 0.01040233, + "balance_loss_clip": 1.06198764, + "balance_loss_mlp": 1.03007054, + "epoch": 0.20008417002344736, + "flos": 23657662679040.0, + "grad_norm": 1.6288642803221676, + "language_loss": 0.82824743, + "learning_rate": 3.704282374126398e-06, + "loss": 0.85046256, + "num_input_tokens_seen": 35265615, + "step": 1664, + "time_per_iteration": 2.5889265537261963 + }, + { + "auxiliary_loss_clip": 0.0117494, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.05887675, + "balance_loss_mlp": 1.02223396, + "epoch": 0.20020441291408644, + "flos": 21872076664320.0, + "grad_norm": 1.6582016618941289, + "language_loss": 0.87228185, + "learning_rate": 3.7038745996955954e-06, + "loss": 0.89435303, + "num_input_tokens_seen": 35284960, + "step": 1665, + "time_per_iteration": 2.676917791366577 + }, + { + "auxiliary_loss_clip": 0.01181586, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.05892849, + "balance_loss_mlp": 1.02605915, + "epoch": 0.20032465580472555, + "flos": 23179691376000.0, + "grad_norm": 8.364605105437795, + "language_loss": 0.71882677, + "learning_rate": 3.703466566788371e-06, + "loss": 0.74099624, + "num_input_tokens_seen": 35304090, + "step": 1666, + "time_per_iteration": 2.6527786254882812 + }, + { + "auxiliary_loss_clip": 0.01185144, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.06267095, + "balance_loss_mlp": 1.02418709, + "epoch": 0.20044489869536464, + "flos": 23873521461120.0, + "grad_norm": 1.7802398091804892, + "language_loss": 0.74326873, + "learning_rate": 3.703058275466622e-06, + "loss": 0.76546741, + "num_input_tokens_seen": 35323325, + "step": 1667, + "time_per_iteration": 2.6319797039031982 + }, + { + "auxiliary_loss_clip": 0.01190139, + "auxiliary_loss_mlp": 0.01037229, + "balance_loss_clip": 1.06007433, + "balance_loss_mlp": 1.02771592, + "epoch": 0.20056514158600372, + "flos": 21945226711680.0, + "grad_norm": 1.6500551770118452, + "language_loss": 0.77622211, + "learning_rate": 3.7026497257922877e-06, + "loss": 0.79849571, + "num_input_tokens_seen": 35343635, + "step": 1668, + "time_per_iteration": 2.636244535446167 + }, + { + "auxiliary_loss_clip": 0.01156455, + "auxiliary_loss_mlp": 0.01047609, + "balance_loss_clip": 1.0545131, + "balance_loss_mlp": 1.03779793, + "epoch": 0.20068538447664283, + "flos": 23879159896320.0, + "grad_norm": 1.7287193903547278, + "language_loss": 0.8523429, + "learning_rate": 3.7022409178273436e-06, + "loss": 0.87438351, + "num_input_tokens_seen": 35364615, + "step": 1669, + "time_per_iteration": 2.7075541019439697 + }, + { + "auxiliary_loss_clip": 0.0120001, + "auxiliary_loss_mlp": 0.01029436, + "balance_loss_clip": 1.06105685, + "balance_loss_mlp": 1.02031028, + "epoch": 0.2008056273672819, + "flos": 18442823270400.0, + "grad_norm": 1.744699902509725, + "language_loss": 0.78678107, + "learning_rate": 3.7018318516338054e-06, + "loss": 0.80907547, + "num_input_tokens_seen": 35383775, + "step": 1670, + "time_per_iteration": 2.5497567653656006 + }, + { + "auxiliary_loss_clip": 0.01208669, + "auxiliary_loss_mlp": 0.01028947, + "balance_loss_clip": 1.062778, + "balance_loss_mlp": 1.01976764, + "epoch": 0.200925870257921, + "flos": 23659530186240.0, + "grad_norm": 2.87900845657148, + "language_loss": 0.81513917, + "learning_rate": 3.7014225272737284e-06, + "loss": 0.83751535, + "num_input_tokens_seen": 35403000, + "step": 1671, + "time_per_iteration": 2.586712121963501 + }, + { + "auxiliary_loss_clip": 0.01198212, + "auxiliary_loss_mlp": 0.01034641, + "balance_loss_clip": 1.06055844, + "balance_loss_mlp": 1.02484179, + "epoch": 0.20104611314856008, + "flos": 16217115909120.0, + "grad_norm": 2.2723517604150048, + "language_loss": 0.7415908, + "learning_rate": 3.701012944809207e-06, + "loss": 0.76391935, + "num_input_tokens_seen": 35420115, + "step": 1672, + "time_per_iteration": 3.333911418914795 + }, + { + "auxiliary_loss_clip": 0.01188707, + "auxiliary_loss_mlp": 0.00764452, + "balance_loss_clip": 1.06163788, + "balance_loss_mlp": 1.00105047, + "epoch": 0.2011663560391992, + "flos": 21397373498880.0, + "grad_norm": 2.3451975569181904, + "language_loss": 0.79037344, + "learning_rate": 3.700603104302374e-06, + "loss": 0.80990499, + "num_input_tokens_seen": 35439925, + "step": 1673, + "time_per_iteration": 2.618485689163208 + }, + { + "auxiliary_loss_clip": 0.01070271, + "auxiliary_loss_mlp": 0.01003165, + "balance_loss_clip": 1.02871227, + "balance_loss_mlp": 1.00064969, + "epoch": 0.20128659892983827, + "flos": 62229459409920.0, + "grad_norm": 0.8881138279454771, + "language_loss": 0.55909812, + "learning_rate": 3.7001930058154027e-06, + "loss": 0.57983243, + "num_input_tokens_seen": 35504885, + "step": 1674, + "time_per_iteration": 3.2131967544555664 + }, + { + "auxiliary_loss_clip": 0.01173404, + "auxiliary_loss_mlp": 0.0103804, + "balance_loss_clip": 1.05735135, + "balance_loss_mlp": 1.02799642, + "epoch": 0.20140684182047736, + "flos": 28438737448320.0, + "grad_norm": 2.282931835369254, + "language_loss": 0.80218577, + "learning_rate": 3.6997826494105037e-06, + "loss": 0.82430023, + "num_input_tokens_seen": 35525330, + "step": 1675, + "time_per_iteration": 2.6722702980041504 + }, + { + "auxiliary_loss_clip": 0.01189185, + "auxiliary_loss_mlp": 0.01029578, + "balance_loss_clip": 1.06025648, + "balance_loss_mlp": 1.02014899, + "epoch": 0.20152708471111647, + "flos": 28074064619520.0, + "grad_norm": 2.0277036115078615, + "language_loss": 0.69524276, + "learning_rate": 3.6993720351499286e-06, + "loss": 0.71743041, + "num_input_tokens_seen": 35546455, + "step": 1676, + "time_per_iteration": 2.6323435306549072 + }, + { + "auxiliary_loss_clip": 0.01183767, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.06196809, + "balance_loss_mlp": 1.02442551, + "epoch": 0.20164732760175555, + "flos": 23549751244800.0, + "grad_norm": 1.7119437772356263, + "language_loss": 0.77048182, + "learning_rate": 3.6989611630959666e-06, + "loss": 0.79265696, + "num_input_tokens_seen": 35565010, + "step": 1677, + "time_per_iteration": 2.6108388900756836 + }, + { + "auxiliary_loss_clip": 0.01103065, + "auxiliary_loss_mlp": 0.01000476, + "balance_loss_clip": 1.02454388, + "balance_loss_mlp": 0.99802065, + "epoch": 0.20176757049239463, + "flos": 71100616037760.0, + "grad_norm": 0.6932006752150882, + "language_loss": 0.58356351, + "learning_rate": 3.6985500333109474e-06, + "loss": 0.60459894, + "num_input_tokens_seen": 35633340, + "step": 1678, + "time_per_iteration": 3.211085081100464 + }, + { + "auxiliary_loss_clip": 0.01165351, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.0536654, + "balance_loss_mlp": 1.0252645, + "epoch": 0.20188781338303372, + "flos": 21430159637760.0, + "grad_norm": 2.292766218170489, + "language_loss": 0.7631526, + "learning_rate": 3.6981386458572385e-06, + "loss": 0.78515017, + "num_input_tokens_seen": 35651315, + "step": 1679, + "time_per_iteration": 5.008856773376465 + }, + { + "auxiliary_loss_clip": 0.01169122, + "auxiliary_loss_mlp": 0.0103906, + "balance_loss_clip": 1.05624056, + "balance_loss_mlp": 1.02888584, + "epoch": 0.20200805627367283, + "flos": 11546215130880.0, + "grad_norm": 2.6081241469431906, + "language_loss": 0.7658906, + "learning_rate": 3.6977270007972468e-06, + "loss": 0.78797245, + "num_input_tokens_seen": 35668850, + "step": 1680, + "time_per_iteration": 3.4652955532073975 + }, + { + "auxiliary_loss_clip": 0.01191109, + "auxiliary_loss_mlp": 0.01033774, + "balance_loss_clip": 1.06098557, + "balance_loss_mlp": 1.02419519, + "epoch": 0.2021282991643119, + "flos": 28545391906560.0, + "grad_norm": 3.541670296225184, + "language_loss": 0.7194438, + "learning_rate": 3.6973150981934196e-06, + "loss": 0.7416926, + "num_input_tokens_seen": 35690080, + "step": 1681, + "time_per_iteration": 2.653881549835205 + }, + { + "auxiliary_loss_clip": 0.01221046, + "auxiliary_loss_mlp": 0.01035603, + "balance_loss_clip": 1.06283867, + "balance_loss_mlp": 1.025846, + "epoch": 0.202248542054951, + "flos": 17923446564480.0, + "grad_norm": 2.4605282363226184, + "language_loss": 0.84155953, + "learning_rate": 3.6969029381082415e-06, + "loss": 0.86412597, + "num_input_tokens_seen": 35706075, + "step": 1682, + "time_per_iteration": 2.495000123977661 + }, + { + "auxiliary_loss_clip": 0.01184506, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.05899298, + "balance_loss_mlp": 1.02191353, + "epoch": 0.2023687849455901, + "flos": 19864634296320.0, + "grad_norm": 3.9453847076596618, + "language_loss": 0.79655206, + "learning_rate": 3.696490520604237e-06, + "loss": 0.81870985, + "num_input_tokens_seen": 35724765, + "step": 1683, + "time_per_iteration": 2.578317403793335 + }, + { + "auxiliary_loss_clip": 0.01198847, + "auxiliary_loss_mlp": 0.01025917, + "balance_loss_clip": 1.0611608, + "balance_loss_mlp": 1.01688099, + "epoch": 0.20248902783622919, + "flos": 22564721600640.0, + "grad_norm": 1.7052939037775134, + "language_loss": 0.80656451, + "learning_rate": 3.696077845743968e-06, + "loss": 0.82881212, + "num_input_tokens_seen": 35744355, + "step": 1684, + "time_per_iteration": 2.5487425327301025 + }, + { + "auxiliary_loss_clip": 0.01221345, + "auxiliary_loss_mlp": 0.01033491, + "balance_loss_clip": 1.06353426, + "balance_loss_mlp": 1.02342391, + "epoch": 0.20260927072686827, + "flos": 22709728805760.0, + "grad_norm": 2.3815747828739964, + "language_loss": 0.73337722, + "learning_rate": 3.69566491359004e-06, + "loss": 0.75592554, + "num_input_tokens_seen": 35761000, + "step": 1685, + "time_per_iteration": 2.528510093688965 + }, + { + "auxiliary_loss_clip": 0.0118516, + "auxiliary_loss_mlp": 0.01034886, + "balance_loss_clip": 1.05817342, + "balance_loss_mlp": 1.02492011, + "epoch": 0.20272951361750738, + "flos": 51023998650240.0, + "grad_norm": 3.933181495170971, + "language_loss": 0.69612551, + "learning_rate": 3.695251724205092e-06, + "loss": 0.71832597, + "num_input_tokens_seen": 35785360, + "step": 1686, + "time_per_iteration": 2.8178625106811523 + }, + { + "auxiliary_loss_clip": 0.01216492, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.06177151, + "balance_loss_mlp": 1.02323294, + "epoch": 0.20284975650814646, + "flos": 26578133879040.0, + "grad_norm": 1.6286031159580288, + "language_loss": 0.86638433, + "learning_rate": 3.6948382776518054e-06, + "loss": 0.8888793, + "num_input_tokens_seen": 35806065, + "step": 1687, + "time_per_iteration": 2.5669915676116943 + }, + { + "auxiliary_loss_clip": 0.01181965, + "auxiliary_loss_mlp": 0.01039167, + "balance_loss_clip": 1.05671179, + "balance_loss_mlp": 1.02941513, + "epoch": 0.20296999939878554, + "flos": 16034222833920.0, + "grad_norm": 4.514600189293757, + "language_loss": 0.79371703, + "learning_rate": 3.6944245739929e-06, + "loss": 0.81592834, + "num_input_tokens_seen": 35822225, + "step": 1688, + "time_per_iteration": 2.549323797225952 + }, + { + "auxiliary_loss_clip": 0.01201791, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.06029034, + "balance_loss_mlp": 1.03046536, + "epoch": 0.20309024228942463, + "flos": 19203374868480.0, + "grad_norm": 6.2101087334099425, + "language_loss": 0.71964628, + "learning_rate": 3.6940106132911332e-06, + "loss": 0.74206793, + "num_input_tokens_seen": 35839410, + "step": 1689, + "time_per_iteration": 2.533395767211914 + }, + { + "auxiliary_loss_clip": 0.01207532, + "auxiliary_loss_mlp": 0.01034316, + "balance_loss_clip": 1.06277561, + "balance_loss_mlp": 1.02512527, + "epoch": 0.20321048518006374, + "flos": 22821087945600.0, + "grad_norm": 1.7585589232562562, + "language_loss": 0.88923246, + "learning_rate": 3.6935963956093037e-06, + "loss": 0.91165096, + "num_input_tokens_seen": 35859495, + "step": 1690, + "time_per_iteration": 2.545464038848877 + }, + { + "auxiliary_loss_clip": 0.01194583, + "auxiliary_loss_mlp": 0.01032331, + "balance_loss_clip": 1.05826819, + "balance_loss_mlp": 1.02330697, + "epoch": 0.20333072807070282, + "flos": 19096397187840.0, + "grad_norm": 1.734803479224798, + "language_loss": 0.69162923, + "learning_rate": 3.6931819210102474e-06, + "loss": 0.71389842, + "num_input_tokens_seen": 35878890, + "step": 1691, + "time_per_iteration": 2.5537819862365723 + }, + { + "auxiliary_loss_clip": 0.0122165, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.06419921, + "balance_loss_mlp": 1.02537251, + "epoch": 0.2034509709613419, + "flos": 18180962144640.0, + "grad_norm": 2.022279199970276, + "language_loss": 0.8474915, + "learning_rate": 3.6927671895568402e-06, + "loss": 0.87005955, + "num_input_tokens_seen": 35897950, + "step": 1692, + "time_per_iteration": 2.5202362537384033 + }, + { + "auxiliary_loss_clip": 0.01218279, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.0636524, + "balance_loss_mlp": 1.02619874, + "epoch": 0.20357121385198101, + "flos": 22923899648640.0, + "grad_norm": 1.8241207289876609, + "language_loss": 0.86901534, + "learning_rate": 3.692352201311996e-06, + "loss": 0.89155555, + "num_input_tokens_seen": 35916800, + "step": 1693, + "time_per_iteration": 2.512848138809204 + }, + { + "auxiliary_loss_clip": 0.01169803, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.05633664, + "balance_loss_mlp": 1.02000093, + "epoch": 0.2036914567426201, + "flos": 20922131629440.0, + "grad_norm": 1.7842334458437972, + "language_loss": 0.76321661, + "learning_rate": 3.6919369563386687e-06, + "loss": 0.78520793, + "num_input_tokens_seen": 35936600, + "step": 1694, + "time_per_iteration": 2.620120048522949 + }, + { + "auxiliary_loss_clip": 0.01187043, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.06135082, + "balance_loss_mlp": 1.02387381, + "epoch": 0.20381169963325918, + "flos": 15519155760000.0, + "grad_norm": 2.6611281919122014, + "language_loss": 0.78831178, + "learning_rate": 3.69152145469985e-06, + "loss": 0.81050956, + "num_input_tokens_seen": 35953645, + "step": 1695, + "time_per_iteration": 2.553910255432129 + }, + { + "auxiliary_loss_clip": 0.01162611, + "auxiliary_loss_mlp": 0.01044852, + "balance_loss_clip": 1.05558705, + "balance_loss_mlp": 1.03397417, + "epoch": 0.20393194252389826, + "flos": 28833143760000.0, + "grad_norm": 2.0520999050451434, + "language_loss": 0.81947726, + "learning_rate": 3.691105696458572e-06, + "loss": 0.8415519, + "num_input_tokens_seen": 35970940, + "step": 1696, + "time_per_iteration": 2.6742448806762695 + }, + { + "auxiliary_loss_clip": 0.01220367, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.06654644, + "balance_loss_mlp": 1.02135515, + "epoch": 0.20405218541453737, + "flos": 22488554810880.0, + "grad_norm": 4.328252715624741, + "language_loss": 0.67752624, + "learning_rate": 3.690689681677904e-06, + "loss": 0.70003736, + "num_input_tokens_seen": 35989410, + "step": 1697, + "time_per_iteration": 2.527660846710205 + }, + { + "auxiliary_loss_clip": 0.01188608, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.05847466, + "balance_loss_mlp": 1.02272201, + "epoch": 0.20417242830517646, + "flos": 25374408278400.0, + "grad_norm": 1.7069664040136512, + "language_loss": 0.88590586, + "learning_rate": 3.690273410420956e-06, + "loss": 0.90810943, + "num_input_tokens_seen": 36009175, + "step": 1698, + "time_per_iteration": 2.61661434173584 + }, + { + "auxiliary_loss_clip": 0.01202719, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.06076145, + "balance_loss_mlp": 1.02700412, + "epoch": 0.20429267119581554, + "flos": 14793078240000.0, + "grad_norm": 4.383743480557219, + "language_loss": 0.76848876, + "learning_rate": 3.689856882750875e-06, + "loss": 0.79087889, + "num_input_tokens_seen": 36024375, + "step": 1699, + "time_per_iteration": 3.3093175888061523 + }, + { + "auxiliary_loss_clip": 0.01200932, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.06264687, + "balance_loss_mlp": 1.02391386, + "epoch": 0.20441291408645465, + "flos": 17781851151360.0, + "grad_norm": 1.8709501683404441, + "language_loss": 0.78793871, + "learning_rate": 3.6894400987308486e-06, + "loss": 0.81027502, + "num_input_tokens_seen": 36041895, + "step": 1700, + "time_per_iteration": 2.4629805088043213 + }, + { + "auxiliary_loss_clip": 0.01207364, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.06224144, + "balance_loss_mlp": 1.02873802, + "epoch": 0.20453315697709373, + "flos": 16435668211200.0, + "grad_norm": 1.8963789630471082, + "language_loss": 0.84851086, + "learning_rate": 3.6890230584241024e-06, + "loss": 0.87096936, + "num_input_tokens_seen": 36058825, + "step": 1701, + "time_per_iteration": 2.473630905151367 + }, + { + "auxiliary_loss_clip": 0.01117082, + "auxiliary_loss_mlp": 0.01017989, + "balance_loss_clip": 1.02653635, + "balance_loss_mlp": 1.01595056, + "epoch": 0.20465339986773282, + "flos": 66713085653760.0, + "grad_norm": 1.079628427469279, + "language_loss": 0.66417402, + "learning_rate": 3.6886057618939016e-06, + "loss": 0.6855247, + "num_input_tokens_seen": 36121645, + "step": 1702, + "time_per_iteration": 3.0877153873443604 + }, + { + "auxiliary_loss_clip": 0.01166953, + "auxiliary_loss_mlp": 0.01043209, + "balance_loss_clip": 1.0561502, + "balance_loss_mlp": 1.03309953, + "epoch": 0.2047736427583719, + "flos": 41974114924800.0, + "grad_norm": 1.9040943911125248, + "language_loss": 0.6910609, + "learning_rate": 3.6881882092035492e-06, + "loss": 0.71316254, + "num_input_tokens_seen": 36143030, + "step": 1703, + "time_per_iteration": 2.7542049884796143 + }, + { + "auxiliary_loss_clip": 0.01088371, + "auxiliary_loss_mlp": 0.00755769, + "balance_loss_clip": 1.02766347, + "balance_loss_mlp": 1.00053215, + "epoch": 0.204893885649011, + "flos": 69940878641280.0, + "grad_norm": 0.9258463559652654, + "language_loss": 0.61184669, + "learning_rate": 3.6877704004163873e-06, + "loss": 0.630288, + "num_input_tokens_seen": 36203435, + "step": 1704, + "time_per_iteration": 3.2626121044158936 + }, + { + "auxiliary_loss_clip": 0.01220419, + "auxiliary_loss_mlp": 0.01033827, + "balance_loss_clip": 1.06428337, + "balance_loss_mlp": 1.02348018, + "epoch": 0.2050141285396501, + "flos": 22200012858240.0, + "grad_norm": 2.175912443947148, + "language_loss": 0.77588618, + "learning_rate": 3.6873523355957984e-06, + "loss": 0.79842865, + "num_input_tokens_seen": 36222435, + "step": 1705, + "time_per_iteration": 3.327622413635254 + }, + { + "auxiliary_loss_clip": 0.01115365, + "auxiliary_loss_mlp": 0.01006685, + "balance_loss_clip": 1.02520096, + "balance_loss_mlp": 1.0046463, + "epoch": 0.20513437143028918, + "flos": 46283721730560.0, + "grad_norm": 0.9799203017007736, + "language_loss": 0.6413393, + "learning_rate": 3.686934014805201e-06, + "loss": 0.66255981, + "num_input_tokens_seen": 36273065, + "step": 1706, + "time_per_iteration": 3.7156033515930176 + }, + { + "auxiliary_loss_clip": 0.01202934, + "auxiliary_loss_mlp": 0.01038778, + "balance_loss_clip": 1.06356168, + "balance_loss_mlp": 1.02921748, + "epoch": 0.20525461432092829, + "flos": 21904324099200.0, + "grad_norm": 1.8423043649697464, + "language_loss": 0.81273705, + "learning_rate": 3.6865154381080552e-06, + "loss": 0.83515418, + "num_input_tokens_seen": 36293750, + "step": 1707, + "time_per_iteration": 3.3514809608459473 + }, + { + "auxiliary_loss_clip": 0.01128011, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.05236316, + "balance_loss_mlp": 1.02069092, + "epoch": 0.20537485721156737, + "flos": 21214264942080.0, + "grad_norm": 1.94272526529882, + "language_loss": 0.82480317, + "learning_rate": 3.6860966055678585e-06, + "loss": 0.84638226, + "num_input_tokens_seen": 36310105, + "step": 1708, + "time_per_iteration": 2.6890857219696045 + }, + { + "auxiliary_loss_clip": 0.01205871, + "auxiliary_loss_mlp": 0.01041815, + "balance_loss_clip": 1.06422508, + "balance_loss_mlp": 1.0317837, + "epoch": 0.20549510010220645, + "flos": 20191205773440.0, + "grad_norm": 1.8327083175605325, + "language_loss": 0.86357588, + "learning_rate": 3.685677517248147e-06, + "loss": 0.88605273, + "num_input_tokens_seen": 36328995, + "step": 1709, + "time_per_iteration": 2.5011420249938965 + }, + { + "auxiliary_loss_clip": 0.01189419, + "auxiliary_loss_mlp": 0.00764406, + "balance_loss_clip": 1.06558514, + "balance_loss_mlp": 1.00087571, + "epoch": 0.20561534299284553, + "flos": 17016702612480.0, + "grad_norm": 1.8940528465644986, + "language_loss": 0.80435836, + "learning_rate": 3.6852581732124967e-06, + "loss": 0.82389659, + "num_input_tokens_seen": 36346340, + "step": 1710, + "time_per_iteration": 2.5553958415985107 + }, + { + "auxiliary_loss_clip": 0.01206969, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.06480241, + "balance_loss_mlp": 1.02351213, + "epoch": 0.20573558588348465, + "flos": 22890467064960.0, + "grad_norm": 2.9550287906302084, + "language_loss": 0.76077873, + "learning_rate": 3.6848385735245213e-06, + "loss": 0.78318411, + "num_input_tokens_seen": 36365430, + "step": 1711, + "time_per_iteration": 2.545814275741577 + }, + { + "auxiliary_loss_clip": 0.01186558, + "auxiliary_loss_mlp": 0.01032695, + "balance_loss_clip": 1.05517054, + "balance_loss_mlp": 1.02336097, + "epoch": 0.20585582877412373, + "flos": 24643123286400.0, + "grad_norm": 1.816164872469563, + "language_loss": 0.86060917, + "learning_rate": 3.6844187182478734e-06, + "loss": 0.88280165, + "num_input_tokens_seen": 36386285, + "step": 1712, + "time_per_iteration": 2.5832128524780273 + }, + { + "auxiliary_loss_clip": 0.01178493, + "auxiliary_loss_mlp": 0.01030458, + "balance_loss_clip": 1.05583549, + "balance_loss_mlp": 1.02099848, + "epoch": 0.2059760716647628, + "flos": 24206952435840.0, + "grad_norm": 1.656136867252933, + "language_loss": 0.74730504, + "learning_rate": 3.683998607446246e-06, + "loss": 0.76939458, + "num_input_tokens_seen": 36404935, + "step": 1713, + "time_per_iteration": 2.5760860443115234 + }, + { + "auxiliary_loss_clip": 0.01204658, + "auxiliary_loss_mlp": 0.01045292, + "balance_loss_clip": 1.06433654, + "balance_loss_mlp": 1.03669727, + "epoch": 0.20609631455540192, + "flos": 20229522606720.0, + "grad_norm": 1.8677170672975212, + "language_loss": 0.75019348, + "learning_rate": 3.6835782411833686e-06, + "loss": 0.77269304, + "num_input_tokens_seen": 36424455, + "step": 1714, + "time_per_iteration": 2.557457447052002 + }, + { + "auxiliary_loss_clip": 0.01163789, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.05579591, + "balance_loss_mlp": 1.02487016, + "epoch": 0.206216557446041, + "flos": 19864957518720.0, + "grad_norm": 1.9216832351580975, + "language_loss": 0.74329787, + "learning_rate": 3.68315761952301e-06, + "loss": 0.76528215, + "num_input_tokens_seen": 36441685, + "step": 1715, + "time_per_iteration": 2.557408094406128 + }, + { + "auxiliary_loss_clip": 0.01220315, + "auxiliary_loss_mlp": 0.01037572, + "balance_loss_clip": 1.06523728, + "balance_loss_mlp": 1.02794552, + "epoch": 0.2063368003366801, + "flos": 24096311568000.0, + "grad_norm": 2.007141782840156, + "language_loss": 0.83047593, + "learning_rate": 3.6827367425289797e-06, + "loss": 0.85305476, + "num_input_tokens_seen": 36461460, + "step": 1716, + "time_per_iteration": 2.5541040897369385 + }, + { + "auxiliary_loss_clip": 0.01190372, + "auxiliary_loss_mlp": 0.01030896, + "balance_loss_clip": 1.06114793, + "balance_loss_mlp": 1.02061439, + "epoch": 0.2064570432273192, + "flos": 20340163474560.0, + "grad_norm": 2.7823186274286016, + "language_loss": 0.72970492, + "learning_rate": 3.6823156102651225e-06, + "loss": 0.7519176, + "num_input_tokens_seen": 36479615, + "step": 1717, + "time_per_iteration": 2.5223388671875 + }, + { + "auxiliary_loss_clip": 0.01134096, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.05721724, + "balance_loss_mlp": 1.02146244, + "epoch": 0.20657728611795828, + "flos": 20520363029760.0, + "grad_norm": 1.8698432655575277, + "language_loss": 0.70648134, + "learning_rate": 3.6818942227953257e-06, + "loss": 0.7281332, + "num_input_tokens_seen": 36500160, + "step": 1718, + "time_per_iteration": 2.614090919494629 + }, + { + "auxiliary_loss_clip": 0.01175611, + "auxiliary_loss_mlp": 0.01031033, + "balance_loss_clip": 1.0603174, + "balance_loss_mlp": 1.02132297, + "epoch": 0.20669752900859736, + "flos": 21799285752960.0, + "grad_norm": 1.9106012471330371, + "language_loss": 0.69166601, + "learning_rate": 3.681472580183512e-06, + "loss": 0.71373242, + "num_input_tokens_seen": 36518810, + "step": 1719, + "time_per_iteration": 2.5682260990142822 + }, + { + "auxiliary_loss_clip": 0.0120147, + "auxiliary_loss_mlp": 0.01032327, + "balance_loss_clip": 1.06394529, + "balance_loss_mlp": 1.02333891, + "epoch": 0.20681777189923645, + "flos": 15122020014720.0, + "grad_norm": 1.7915004781149155, + "language_loss": 0.86325067, + "learning_rate": 3.6810506824936455e-06, + "loss": 0.88558865, + "num_input_tokens_seen": 36536890, + "step": 1720, + "time_per_iteration": 2.47111177444458 + }, + { + "auxiliary_loss_clip": 0.01087482, + "auxiliary_loss_mlp": 0.010051, + "balance_loss_clip": 1.02257824, + "balance_loss_mlp": 1.00282288, + "epoch": 0.20693801478987556, + "flos": 56481021509760.0, + "grad_norm": 1.1317507893838084, + "language_loss": 0.62530267, + "learning_rate": 3.680628529789726e-06, + "loss": 0.64622855, + "num_input_tokens_seen": 36589300, + "step": 1721, + "time_per_iteration": 2.94596266746521 + }, + { + "auxiliary_loss_clip": 0.012262, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.0663687, + "balance_loss_mlp": 1.02882624, + "epoch": 0.20705825768051464, + "flos": 21614201948160.0, + "grad_norm": 1.8765563320858571, + "language_loss": 0.85612869, + "learning_rate": 3.680206122135796e-06, + "loss": 0.87878513, + "num_input_tokens_seen": 36609905, + "step": 1722, + "time_per_iteration": 2.4761531352996826 + }, + { + "auxiliary_loss_clip": 0.0116922, + "auxiliary_loss_mlp": 0.010387, + "balance_loss_clip": 1.06507683, + "balance_loss_mlp": 1.02997386, + "epoch": 0.20717850057115372, + "flos": 25848895962240.0, + "grad_norm": 1.803188406557687, + "language_loss": 0.78218549, + "learning_rate": 3.6797834595959323e-06, + "loss": 0.80426466, + "num_input_tokens_seen": 36629805, + "step": 1723, + "time_per_iteration": 2.61983323097229 + }, + { + "auxiliary_loss_clip": 0.01147727, + "auxiliary_loss_mlp": 0.01042117, + "balance_loss_clip": 1.05313325, + "balance_loss_mlp": 1.03179991, + "epoch": 0.20729874346179283, + "flos": 29130807767040.0, + "grad_norm": 2.4710442945737277, + "language_loss": 0.78367615, + "learning_rate": 3.679360542234254e-06, + "loss": 0.80557466, + "num_input_tokens_seen": 36649150, + "step": 1724, + "time_per_iteration": 2.654662847518921 + }, + { + "auxiliary_loss_clip": 0.01182316, + "auxiliary_loss_mlp": 0.00764963, + "balance_loss_clip": 1.05590117, + "balance_loss_mlp": 1.00089419, + "epoch": 0.20741898635243192, + "flos": 29023363209600.0, + "grad_norm": 1.6746600180368396, + "language_loss": 0.72186583, + "learning_rate": 3.678937370114916e-06, + "loss": 0.74133861, + "num_input_tokens_seen": 36668955, + "step": 1725, + "time_per_iteration": 3.48898983001709 + }, + { + "auxiliary_loss_clip": 0.01184175, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.06155932, + "balance_loss_mlp": 1.02088475, + "epoch": 0.207539229243071, + "flos": 15559447841280.0, + "grad_norm": 1.9454122270823844, + "language_loss": 0.78826487, + "learning_rate": 3.678513943302114e-06, + "loss": 0.81040084, + "num_input_tokens_seen": 36685730, + "step": 1726, + "time_per_iteration": 2.545203924179077 + }, + { + "auxiliary_loss_clip": 0.0121827, + "auxiliary_loss_mlp": 0.01035791, + "balance_loss_clip": 1.06433117, + "balance_loss_mlp": 1.02687383, + "epoch": 0.20765947213371008, + "flos": 20521081301760.0, + "grad_norm": 1.6933242353386795, + "language_loss": 0.8538264, + "learning_rate": 3.678090261860082e-06, + "loss": 0.87636709, + "num_input_tokens_seen": 36705460, + "step": 1727, + "time_per_iteration": 2.5430078506469727 + }, + { + "auxiliary_loss_clip": 0.01172423, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.05392444, + "balance_loss_mlp": 1.02663016, + "epoch": 0.2077797150243492, + "flos": 19354415558400.0, + "grad_norm": 1.8073339637475292, + "language_loss": 0.77326918, + "learning_rate": 3.6776663258530906e-06, + "loss": 0.7953524, + "num_input_tokens_seen": 36724110, + "step": 1728, + "time_per_iteration": 2.6801679134368896 + }, + { + "auxiliary_loss_clip": 0.01206503, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.06291127, + "balance_loss_mlp": 1.0240382, + "epoch": 0.20789995791498828, + "flos": 21829952989440.0, + "grad_norm": 1.8307166220351654, + "language_loss": 0.71375144, + "learning_rate": 3.6772421353454516e-06, + "loss": 0.73614573, + "num_input_tokens_seen": 36742705, + "step": 1729, + "time_per_iteration": 2.6156044006347656 + }, + { + "auxiliary_loss_clip": 0.01202153, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.06386757, + "balance_loss_mlp": 1.02293587, + "epoch": 0.20802020080562736, + "flos": 23148844571520.0, + "grad_norm": 8.882434085461293, + "language_loss": 0.8839817, + "learning_rate": 3.6768176904015153e-06, + "loss": 0.90632546, + "num_input_tokens_seen": 36762510, + "step": 1730, + "time_per_iteration": 2.529400587081909 + }, + { + "auxiliary_loss_clip": 0.01203104, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.06127071, + "balance_loss_mlp": 1.02921844, + "epoch": 0.20814044369626647, + "flos": 23072677781760.0, + "grad_norm": 2.6368028659940492, + "language_loss": 0.60422206, + "learning_rate": 3.6763929910856674e-06, + "loss": 0.62663811, + "num_input_tokens_seen": 36780960, + "step": 1731, + "time_per_iteration": 3.3543660640716553 + }, + { + "auxiliary_loss_clip": 0.01203673, + "auxiliary_loss_mlp": 0.01041385, + "balance_loss_clip": 1.06389642, + "balance_loss_mlp": 1.03196144, + "epoch": 0.20826068658690555, + "flos": 19608016556160.0, + "grad_norm": 2.414765034132618, + "language_loss": 0.77754235, + "learning_rate": 3.6759680374623365e-06, + "loss": 0.79999292, + "num_input_tokens_seen": 36798875, + "step": 1732, + "time_per_iteration": 3.3706953525543213 + }, + { + "auxiliary_loss_clip": 0.01216353, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.06389928, + "balance_loss_mlp": 1.02096367, + "epoch": 0.20838092947754464, + "flos": 25374049142400.0, + "grad_norm": 2.383951389535304, + "language_loss": 0.75453377, + "learning_rate": 3.675542829595986e-06, + "loss": 0.7770012, + "num_input_tokens_seen": 36818540, + "step": 1733, + "time_per_iteration": 3.331346035003662 + }, + { + "auxiliary_loss_clip": 0.01187756, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.05992317, + "balance_loss_mlp": 1.02521074, + "epoch": 0.20850117236818372, + "flos": 24061729749120.0, + "grad_norm": 1.437901828774705, + "language_loss": 0.79396731, + "learning_rate": 3.6751173675511213e-06, + "loss": 0.81619018, + "num_input_tokens_seen": 36840585, + "step": 1734, + "time_per_iteration": 2.581486463546753 + }, + { + "auxiliary_loss_clip": 0.01182469, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.05398667, + "balance_loss_mlp": 1.02736163, + "epoch": 0.20862141525882283, + "flos": 20077799558400.0, + "grad_norm": 2.1602936115016993, + "language_loss": 0.87514347, + "learning_rate": 3.674691651392283e-06, + "loss": 0.89733005, + "num_input_tokens_seen": 36858255, + "step": 1735, + "time_per_iteration": 2.5433480739593506 + }, + { + "auxiliary_loss_clip": 0.01194135, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_clip": 1.0634383, + "balance_loss_mlp": 1.03649926, + "epoch": 0.2087416581494619, + "flos": 39015183237120.0, + "grad_norm": 2.4436272545325513, + "language_loss": 0.76214147, + "learning_rate": 3.674265681184053e-06, + "loss": 0.78453982, + "num_input_tokens_seen": 36881515, + "step": 1736, + "time_per_iteration": 2.6752898693084717 + }, + { + "auxiliary_loss_clip": 0.01188291, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.05806065, + "balance_loss_mlp": 1.02281713, + "epoch": 0.208861901040101, + "flos": 26101994169600.0, + "grad_norm": 1.6540068303848314, + "language_loss": 0.86296463, + "learning_rate": 3.6738394569910504e-06, + "loss": 0.885167, + "num_input_tokens_seen": 36902055, + "step": 1737, + "time_per_iteration": 2.56394362449646 + }, + { + "auxiliary_loss_clip": 0.01202918, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.06375539, + "balance_loss_mlp": 1.02349091, + "epoch": 0.2089821439307401, + "flos": 28398732675840.0, + "grad_norm": 2.2776346492230375, + "language_loss": 0.82504511, + "learning_rate": 3.6734129788779333e-06, + "loss": 0.8473981, + "num_input_tokens_seen": 36921230, + "step": 1738, + "time_per_iteration": 2.5582215785980225 + }, + { + "auxiliary_loss_clip": 0.01170748, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.06071949, + "balance_loss_mlp": 1.02264166, + "epoch": 0.2091023868213792, + "flos": 21069616872960.0, + "grad_norm": 1.6406682291254882, + "language_loss": 0.90560412, + "learning_rate": 3.6729862469093976e-06, + "loss": 0.92763019, + "num_input_tokens_seen": 36940325, + "step": 1739, + "time_per_iteration": 2.5706984996795654 + }, + { + "auxiliary_loss_clip": 0.01172463, + "auxiliary_loss_mlp": 0.0103843, + "balance_loss_clip": 1.05536759, + "balance_loss_mlp": 1.02911961, + "epoch": 0.20922262971201827, + "flos": 22455481363200.0, + "grad_norm": 2.4190153924251176, + "language_loss": 0.82254064, + "learning_rate": 3.6725592611501782e-06, + "loss": 0.84464955, + "num_input_tokens_seen": 36959000, + "step": 1740, + "time_per_iteration": 2.5368447303771973 + }, + { + "auxiliary_loss_clip": 0.01200981, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.06038165, + "balance_loss_mlp": 1.02958798, + "epoch": 0.20934287260265738, + "flos": 27852244179840.0, + "grad_norm": 3.252599703194495, + "language_loss": 0.7698943, + "learning_rate": 3.6721320216650496e-06, + "loss": 0.79229492, + "num_input_tokens_seen": 36979615, + "step": 1741, + "time_per_iteration": 2.5715086460113525 + }, + { + "auxiliary_loss_clip": 0.01185693, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.0593679, + "balance_loss_mlp": 1.03079963, + "epoch": 0.20946311549329646, + "flos": 16435309075200.0, + "grad_norm": 1.6848133030444423, + "language_loss": 0.83857214, + "learning_rate": 3.6717045285188215e-06, + "loss": 0.86083245, + "num_input_tokens_seen": 36997310, + "step": 1742, + "time_per_iteration": 2.538525342941284 + }, + { + "auxiliary_loss_clip": 0.01141515, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.05084956, + "balance_loss_mlp": 1.02732861, + "epoch": 0.20958335838393555, + "flos": 22492720788480.0, + "grad_norm": 2.1099686823320654, + "language_loss": 0.87020957, + "learning_rate": 3.671276781776346e-06, + "loss": 0.89199394, + "num_input_tokens_seen": 37015965, + "step": 1743, + "time_per_iteration": 2.6134045124053955 + }, + { + "auxiliary_loss_clip": 0.01176265, + "auxiliary_loss_mlp": 0.01033339, + "balance_loss_clip": 1.05311918, + "balance_loss_mlp": 1.02415419, + "epoch": 0.20970360127457463, + "flos": 25224768218880.0, + "grad_norm": 1.906691330481457, + "language_loss": 0.67145264, + "learning_rate": 3.6708487815025128e-06, + "loss": 0.69354868, + "num_input_tokens_seen": 37036545, + "step": 1744, + "time_per_iteration": 2.58872652053833 + }, + { + "auxiliary_loss_clip": 0.01172732, + "auxiliary_loss_mlp": 0.01030618, + "balance_loss_clip": 1.05657196, + "balance_loss_mlp": 1.02116442, + "epoch": 0.20982384416521374, + "flos": 18479164855680.0, + "grad_norm": 2.292926979330607, + "language_loss": 0.74404144, + "learning_rate": 3.6704205277622463e-06, + "loss": 0.7660749, + "num_input_tokens_seen": 37054985, + "step": 1745, + "time_per_iteration": 2.6021785736083984 + }, + { + "auxiliary_loss_clip": 0.01187591, + "auxiliary_loss_mlp": 0.01033413, + "balance_loss_clip": 1.05626488, + "balance_loss_mlp": 1.0242219, + "epoch": 0.20994408705585282, + "flos": 25373546352000.0, + "grad_norm": 1.6762954296293981, + "language_loss": 0.80512577, + "learning_rate": 3.6699920206205146e-06, + "loss": 0.82733583, + "num_input_tokens_seen": 37075725, + "step": 1746, + "time_per_iteration": 2.5860376358032227 + }, + { + "auxiliary_loss_clip": 0.01201971, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.05970824, + "balance_loss_mlp": 1.02510393, + "epoch": 0.2100643299464919, + "flos": 21320955313920.0, + "grad_norm": 2.0023847954338843, + "language_loss": 0.81598699, + "learning_rate": 3.669563260142321e-06, + "loss": 0.83834743, + "num_input_tokens_seen": 37094615, + "step": 1747, + "time_per_iteration": 2.5021493434906006 + }, + { + "auxiliary_loss_clip": 0.01182758, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.06117511, + "balance_loss_mlp": 1.02884698, + "epoch": 0.21018457283713102, + "flos": 19354379644800.0, + "grad_norm": 1.9329789875094434, + "language_loss": 0.84209371, + "learning_rate": 3.6691342463927083e-06, + "loss": 0.86430287, + "num_input_tokens_seen": 37113610, + "step": 1748, + "time_per_iteration": 2.5173652172088623 + }, + { + "auxiliary_loss_clip": 0.01174863, + "auxiliary_loss_mlp": 0.01038985, + "balance_loss_clip": 1.05768466, + "balance_loss_mlp": 1.02957892, + "epoch": 0.2103048157277701, + "flos": 28330035914880.0, + "grad_norm": 1.619241627535905, + "language_loss": 0.81551063, + "learning_rate": 3.668704979436758e-06, + "loss": 0.83764911, + "num_input_tokens_seen": 37133705, + "step": 1749, + "time_per_iteration": 2.626819610595703 + }, + { + "auxiliary_loss_clip": 0.01176972, + "auxiliary_loss_mlp": 0.01035693, + "balance_loss_clip": 1.05405307, + "balance_loss_mlp": 1.02638865, + "epoch": 0.21042505861840918, + "flos": 17457290835840.0, + "grad_norm": 1.9597630624351245, + "language_loss": 0.78709507, + "learning_rate": 3.668275459339588e-06, + "loss": 0.80922174, + "num_input_tokens_seen": 37152185, + "step": 1750, + "time_per_iteration": 2.5133371353149414 + }, + { + "auxiliary_loss_clip": 0.01216895, + "auxiliary_loss_mlp": 0.01037369, + "balance_loss_clip": 1.0639286, + "balance_loss_mlp": 1.02714109, + "epoch": 0.21054530150904827, + "flos": 14209817195520.0, + "grad_norm": 2.112450244642857, + "language_loss": 0.80269277, + "learning_rate": 3.667845686166358e-06, + "loss": 0.82523543, + "num_input_tokens_seen": 37169110, + "step": 1751, + "time_per_iteration": 2.430022716522217 + }, + { + "auxiliary_loss_clip": 0.01153772, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.05169058, + "balance_loss_mlp": 1.02035403, + "epoch": 0.21066554439968738, + "flos": 18618210403200.0, + "grad_norm": 1.7866486114255284, + "language_loss": 0.85896385, + "learning_rate": 3.6674156599822634e-06, + "loss": 0.88080275, + "num_input_tokens_seen": 37184905, + "step": 1752, + "time_per_iteration": 3.3723833560943604 + }, + { + "auxiliary_loss_clip": 0.01157478, + "auxiliary_loss_mlp": 0.0103964, + "balance_loss_clip": 1.0509603, + "balance_loss_mlp": 1.02996576, + "epoch": 0.21078578729032646, + "flos": 23658883741440.0, + "grad_norm": 2.2056655646680534, + "language_loss": 0.81837642, + "learning_rate": 3.666985380852539e-06, + "loss": 0.84034765, + "num_input_tokens_seen": 37203910, + "step": 1753, + "time_per_iteration": 2.600553512573242 + }, + { + "auxiliary_loss_clip": 0.01185862, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.0602572, + "balance_loss_mlp": 1.02281022, + "epoch": 0.21090603018096554, + "flos": 29346379240320.0, + "grad_norm": 3.260311458547385, + "language_loss": 0.74748254, + "learning_rate": 3.6665548488424576e-06, + "loss": 0.76966667, + "num_input_tokens_seen": 37222670, + "step": 1754, + "time_per_iteration": 2.575789451599121 + }, + { + "auxiliary_loss_clip": 0.01217189, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.06208384, + "balance_loss_mlp": 1.02701926, + "epoch": 0.21102627307160465, + "flos": 23261245205760.0, + "grad_norm": 2.0270978935915496, + "language_loss": 0.88008004, + "learning_rate": 3.6661240640173307e-06, + "loss": 0.90262526, + "num_input_tokens_seen": 37244140, + "step": 1755, + "time_per_iteration": 2.5638747215270996 + }, + { + "auxiliary_loss_clip": 0.01083785, + "auxiliary_loss_mlp": 0.01010325, + "balance_loss_clip": 1.02759123, + "balance_loss_mlp": 1.00809598, + "epoch": 0.21114651596224374, + "flos": 54633454577280.0, + "grad_norm": 0.8889622053571035, + "language_loss": 0.57912737, + "learning_rate": 3.6656930264425085e-06, + "loss": 0.60006845, + "num_input_tokens_seen": 37308185, + "step": 1756, + "time_per_iteration": 3.1736364364624023 + }, + { + "auxiliary_loss_clip": 0.01217146, + "auxiliary_loss_mlp": 0.01037821, + "balance_loss_clip": 1.06263375, + "balance_loss_mlp": 1.02786684, + "epoch": 0.21126675885288282, + "flos": 21543314457600.0, + "grad_norm": 1.7376468322884182, + "language_loss": 0.75861591, + "learning_rate": 3.665261736183378e-06, + "loss": 0.7811656, + "num_input_tokens_seen": 37328220, + "step": 1757, + "time_per_iteration": 2.507418394088745 + }, + { + "auxiliary_loss_clip": 0.01172478, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.05848813, + "balance_loss_mlp": 1.02159965, + "epoch": 0.2113870017435219, + "flos": 10961876678400.0, + "grad_norm": 2.39046261252039, + "language_loss": 0.89167386, + "learning_rate": 3.664830193305366e-06, + "loss": 0.91371524, + "num_input_tokens_seen": 37345995, + "step": 1758, + "time_per_iteration": 4.308273792266846 + }, + { + "auxiliary_loss_clip": 0.01166234, + "auxiliary_loss_mlp": 0.01036465, + "balance_loss_clip": 1.05290079, + "balance_loss_mlp": 1.02668428, + "epoch": 0.211507244634161, + "flos": 16653825463680.0, + "grad_norm": 6.170319221508419, + "language_loss": 0.76876199, + "learning_rate": 3.6643983978739373e-06, + "loss": 0.79078901, + "num_input_tokens_seen": 37362610, + "step": 1759, + "time_per_iteration": 3.3425939083099365 + }, + { + "auxiliary_loss_clip": 0.01180341, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.05904484, + "balance_loss_mlp": 1.02352321, + "epoch": 0.2116274875248001, + "flos": 20954091755520.0, + "grad_norm": 1.9004249772441613, + "language_loss": 0.8203029, + "learning_rate": 3.663966349954596e-06, + "loss": 0.84244156, + "num_input_tokens_seen": 37382790, + "step": 1760, + "time_per_iteration": 2.5548155307769775 + }, + { + "auxiliary_loss_clip": 0.01105109, + "auxiliary_loss_mlp": 0.01001855, + "balance_loss_clip": 1.02659369, + "balance_loss_mlp": 0.99964952, + "epoch": 0.21174773041543918, + "flos": 68196949424640.0, + "grad_norm": 0.7922156201813727, + "language_loss": 0.59808755, + "learning_rate": 3.6635340496128816e-06, + "loss": 0.6191572, + "num_input_tokens_seen": 37439720, + "step": 1761, + "time_per_iteration": 2.9759645462036133 + }, + { + "auxiliary_loss_clip": 0.01154979, + "auxiliary_loss_mlp": 0.01034496, + "balance_loss_clip": 1.05480206, + "balance_loss_mlp": 1.02534103, + "epoch": 0.2118679733060783, + "flos": 20668315150080.0, + "grad_norm": 1.6493272588694963, + "language_loss": 0.92545718, + "learning_rate": 3.6631014969143747e-06, + "loss": 0.94735187, + "num_input_tokens_seen": 37459410, + "step": 1762, + "time_per_iteration": 2.632343292236328 + }, + { + "auxiliary_loss_clip": 0.01206052, + "auxiliary_loss_mlp": 0.01043486, + "balance_loss_clip": 1.06574392, + "balance_loss_mlp": 1.03408027, + "epoch": 0.21198821619671737, + "flos": 23223431162880.0, + "grad_norm": 1.7015162526646155, + "language_loss": 0.89152634, + "learning_rate": 3.662668691924693e-06, + "loss": 0.91402173, + "num_input_tokens_seen": 37480460, + "step": 1763, + "time_per_iteration": 2.518904685974121 + }, + { + "auxiliary_loss_clip": 0.01170215, + "auxiliary_loss_mlp": 0.01039916, + "balance_loss_clip": 1.05585933, + "balance_loss_mlp": 1.02950943, + "epoch": 0.21210845908735645, + "flos": 24498547044480.0, + "grad_norm": 1.944626822653477, + "language_loss": 0.72100049, + "learning_rate": 3.6622356347094927e-06, + "loss": 0.74310178, + "num_input_tokens_seen": 37502025, + "step": 1764, + "time_per_iteration": 2.589101791381836 + }, + { + "auxiliary_loss_clip": 0.01171676, + "auxiliary_loss_mlp": 0.01037009, + "balance_loss_clip": 1.05284286, + "balance_loss_mlp": 1.02623296, + "epoch": 0.21222870197799554, + "flos": 27089789160960.0, + "grad_norm": 1.945570693918676, + "language_loss": 0.7870971, + "learning_rate": 3.6618023253344684e-06, + "loss": 0.80918401, + "num_input_tokens_seen": 37520885, + "step": 1765, + "time_per_iteration": 2.5923502445220947 + }, + { + "auxiliary_loss_clip": 0.01200865, + "auxiliary_loss_mlp": 0.01041109, + "balance_loss_clip": 1.0592438, + "balance_loss_mlp": 1.030792, + "epoch": 0.21234894486863465, + "flos": 16873850223360.0, + "grad_norm": 1.4454107411192525, + "language_loss": 0.83385426, + "learning_rate": 3.6613687638653527e-06, + "loss": 0.85627401, + "num_input_tokens_seen": 37539055, + "step": 1766, + "time_per_iteration": 2.4639527797698975 + }, + { + "auxiliary_loss_clip": 0.01181866, + "auxiliary_loss_mlp": 0.01035126, + "balance_loss_clip": 1.05760121, + "balance_loss_mlp": 1.02518988, + "epoch": 0.21246918775927373, + "flos": 23474949171840.0, + "grad_norm": 1.8126336215905998, + "language_loss": 0.7757827, + "learning_rate": 3.660934950367916e-06, + "loss": 0.79795271, + "num_input_tokens_seen": 37558300, + "step": 1767, + "time_per_iteration": 2.5510809421539307 + }, + { + "auxiliary_loss_clip": 0.01204323, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.06220198, + "balance_loss_mlp": 1.02671957, + "epoch": 0.21258943064991281, + "flos": 22382295402240.0, + "grad_norm": 1.699834246609623, + "language_loss": 0.83455169, + "learning_rate": 3.660500884907968e-06, + "loss": 0.85696107, + "num_input_tokens_seen": 37579040, + "step": 1768, + "time_per_iteration": 2.5239593982696533 + }, + { + "auxiliary_loss_clip": 0.01069117, + "auxiliary_loss_mlp": 0.01003329, + "balance_loss_clip": 1.02535892, + "balance_loss_mlp": 1.00082552, + "epoch": 0.21270967354055192, + "flos": 59440168679040.0, + "grad_norm": 0.8216218626058457, + "language_loss": 0.60000062, + "learning_rate": 3.660066567551356e-06, + "loss": 0.6207251, + "num_input_tokens_seen": 37639185, + "step": 1769, + "time_per_iteration": 3.0549306869506836 + }, + { + "auxiliary_loss_clip": 0.01200479, + "auxiliary_loss_mlp": 0.00765161, + "balance_loss_clip": 1.06048334, + "balance_loss_mlp": 1.00098419, + "epoch": 0.212829916431191, + "flos": 21544032729600.0, + "grad_norm": 2.228823779654595, + "language_loss": 0.84556282, + "learning_rate": 3.6596319983639657e-06, + "loss": 0.86521918, + "num_input_tokens_seen": 37657765, + "step": 1770, + "time_per_iteration": 2.515789747238159 + }, + { + "auxiliary_loss_clip": 0.0117351, + "auxiliary_loss_mlp": 0.00765912, + "balance_loss_clip": 1.05885124, + "balance_loss_mlp": 1.00096118, + "epoch": 0.2129501593218301, + "flos": 28987739896320.0, + "grad_norm": 1.550211201985683, + "language_loss": 0.86196232, + "learning_rate": 3.6591971774117214e-06, + "loss": 0.88135654, + "num_input_tokens_seen": 37680740, + "step": 1771, + "time_per_iteration": 2.6453745365142822 + }, + { + "auxiliary_loss_clip": 0.01207496, + "auxiliary_loss_mlp": 0.01040615, + "balance_loss_clip": 1.0632937, + "balance_loss_mlp": 1.03048182, + "epoch": 0.2130704022124692, + "flos": 18806993308800.0, + "grad_norm": 2.059605367534018, + "language_loss": 0.80518997, + "learning_rate": 3.6587621047605833e-06, + "loss": 0.82767105, + "num_input_tokens_seen": 37697910, + "step": 1772, + "time_per_iteration": 2.4984118938446045 + }, + { + "auxiliary_loss_clip": 0.0120295, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.06244361, + "balance_loss_mlp": 1.02975905, + "epoch": 0.21319064510310828, + "flos": 13918150759680.0, + "grad_norm": 1.9476953152537653, + "language_loss": 0.86659396, + "learning_rate": 3.6583267804765542e-06, + "loss": 0.8890146, + "num_input_tokens_seen": 37712245, + "step": 1773, + "time_per_iteration": 2.5339155197143555 + }, + { + "auxiliary_loss_clip": 0.01198622, + "auxiliary_loss_mlp": 0.01037087, + "balance_loss_clip": 1.05881619, + "balance_loss_mlp": 1.02641821, + "epoch": 0.21331088799374737, + "flos": 20959694277120.0, + "grad_norm": 1.685265008435033, + "language_loss": 0.85512364, + "learning_rate": 3.6578912046256702e-06, + "loss": 0.87748063, + "num_input_tokens_seen": 37730765, + "step": 1774, + "time_per_iteration": 2.5187294483184814 + }, + { + "auxiliary_loss_clip": 0.0116821, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.05329263, + "balance_loss_mlp": 1.02243781, + "epoch": 0.21343113088438645, + "flos": 18624638937600.0, + "grad_norm": 1.95408503356112, + "language_loss": 0.75892329, + "learning_rate": 3.6574553772740083e-06, + "loss": 0.78093576, + "num_input_tokens_seen": 37748695, + "step": 1775, + "time_per_iteration": 2.541170597076416 + }, + { + "auxiliary_loss_clip": 0.01094954, + "auxiliary_loss_mlp": 0.01002763, + "balance_loss_clip": 1.0269798, + "balance_loss_mlp": 1.00051045, + "epoch": 0.21355137377502556, + "flos": 67413128791680.0, + "grad_norm": 0.8493705330258402, + "language_loss": 0.61886555, + "learning_rate": 3.657019298487684e-06, + "loss": 0.63984269, + "num_input_tokens_seen": 37813705, + "step": 1776, + "time_per_iteration": 3.1203174591064453 + }, + { + "auxiliary_loss_clip": 0.01210159, + "auxiliary_loss_mlp": 0.0076584, + "balance_loss_clip": 1.06136084, + "balance_loss_mlp": 1.00101912, + "epoch": 0.21367161666566464, + "flos": 34532095697280.0, + "grad_norm": 2.6303326203727138, + "language_loss": 0.83458847, + "learning_rate": 3.6565829683328495e-06, + "loss": 0.85434848, + "num_input_tokens_seen": 37836330, + "step": 1777, + "time_per_iteration": 2.6514554023742676 + }, + { + "auxiliary_loss_clip": 0.01196245, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.05905044, + "balance_loss_mlp": 1.02734363, + "epoch": 0.21379185955630373, + "flos": 18989347680000.0, + "grad_norm": 1.6819733295980346, + "language_loss": 0.85998476, + "learning_rate": 3.6561463868756965e-06, + "loss": 0.88232303, + "num_input_tokens_seen": 37855030, + "step": 1778, + "time_per_iteration": 2.454549551010132 + }, + { + "auxiliary_loss_clip": 0.01202991, + "auxiliary_loss_mlp": 0.01036865, + "balance_loss_clip": 1.0637095, + "balance_loss_mlp": 1.0266782, + "epoch": 0.21391210244694284, + "flos": 28218497207040.0, + "grad_norm": 1.7194331359769421, + "language_loss": 0.78008235, + "learning_rate": 3.655709554182452e-06, + "loss": 0.80248088, + "num_input_tokens_seen": 37875370, + "step": 1779, + "time_per_iteration": 3.3246679306030273 + }, + { + "auxiliary_loss_clip": 0.01204383, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.05911446, + "balance_loss_mlp": 1.02421927, + "epoch": 0.21403234533758192, + "flos": 17455064192640.0, + "grad_norm": 2.5177189526327997, + "language_loss": 0.8467648, + "learning_rate": 3.6552724703193855e-06, + "loss": 0.86914563, + "num_input_tokens_seen": 37892560, + "step": 1780, + "time_per_iteration": 2.4566287994384766 + }, + { + "auxiliary_loss_clip": 0.01063446, + "auxiliary_loss_mlp": 0.01008573, + "balance_loss_clip": 1.02203, + "balance_loss_mlp": 1.00595033, + "epoch": 0.214152588228221, + "flos": 51637606686720.0, + "grad_norm": 0.7924905630335375, + "language_loss": 0.55932772, + "learning_rate": 3.654835135352801e-06, + "loss": 0.58004791, + "num_input_tokens_seen": 37947370, + "step": 1781, + "time_per_iteration": 3.0200095176696777 + }, + { + "auxiliary_loss_clip": 0.01154921, + "auxiliary_loss_mlp": 0.01032253, + "balance_loss_clip": 1.05036926, + "balance_loss_mlp": 1.02206063, + "epoch": 0.21427283111886009, + "flos": 19496154625920.0, + "grad_norm": 1.7512825860459846, + "language_loss": 0.87445223, + "learning_rate": 3.654397549349043e-06, + "loss": 0.89632404, + "num_input_tokens_seen": 37964745, + "step": 1782, + "time_per_iteration": 2.579237699508667 + }, + { + "auxiliary_loss_clip": 0.01185015, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.06101263, + "balance_loss_mlp": 1.0240159, + "epoch": 0.2143930740094992, + "flos": 20084802710400.0, + "grad_norm": 2.2447810221344535, + "language_loss": 0.75453168, + "learning_rate": 3.653959712374491e-06, + "loss": 0.77672553, + "num_input_tokens_seen": 37982850, + "step": 1783, + "time_per_iteration": 2.521242141723633 + }, + { + "auxiliary_loss_clip": 0.01166413, + "auxiliary_loss_mlp": 0.01026669, + "balance_loss_clip": 1.05910063, + "balance_loss_mlp": 1.01706624, + "epoch": 0.21451331690013828, + "flos": 21798603394560.0, + "grad_norm": 1.5610972534755763, + "language_loss": 0.82655156, + "learning_rate": 3.6535216244955663e-06, + "loss": 0.84848237, + "num_input_tokens_seen": 38002745, + "step": 1784, + "time_per_iteration": 3.4302234649658203 + }, + { + "auxiliary_loss_clip": 0.01185738, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.05997849, + "balance_loss_mlp": 1.02786827, + "epoch": 0.21463355979077736, + "flos": 32853882412800.0, + "grad_norm": 1.6081607400772948, + "language_loss": 0.70927966, + "learning_rate": 3.653083285778726e-06, + "loss": 0.73151606, + "num_input_tokens_seen": 38024115, + "step": 1785, + "time_per_iteration": 3.5211477279663086 + }, + { + "auxiliary_loss_clip": 0.01206595, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.06041694, + "balance_loss_mlp": 1.02140558, + "epoch": 0.21475380268141647, + "flos": 21543817248000.0, + "grad_norm": 1.9384667437333827, + "language_loss": 0.811643, + "learning_rate": 3.6526446962904653e-06, + "loss": 0.83403093, + "num_input_tokens_seen": 38042830, + "step": 1786, + "time_per_iteration": 3.3201074600219727 + }, + { + "auxiliary_loss_clip": 0.01197391, + "auxiliary_loss_mlp": 0.0104066, + "balance_loss_clip": 1.06145024, + "balance_loss_mlp": 1.03102171, + "epoch": 0.21487404557205556, + "flos": 32159082660480.0, + "grad_norm": 1.513096369347035, + "language_loss": 0.74209732, + "learning_rate": 3.652205856097318e-06, + "loss": 0.76447779, + "num_input_tokens_seen": 38066015, + "step": 1787, + "time_per_iteration": 2.612100124359131 + }, + { + "auxiliary_loss_clip": 0.01180835, + "auxiliary_loss_mlp": 0.00765009, + "balance_loss_clip": 1.05796349, + "balance_loss_mlp": 1.00116944, + "epoch": 0.21499428846269464, + "flos": 12673091583360.0, + "grad_norm": 1.8831345578204703, + "language_loss": 0.79402864, + "learning_rate": 3.651766765265856e-06, + "loss": 0.81348705, + "num_input_tokens_seen": 38083025, + "step": 1788, + "time_per_iteration": 2.5232691764831543 + }, + { + "auxiliary_loss_clip": 0.01180869, + "auxiliary_loss_mlp": 0.01026411, + "balance_loss_clip": 1.05589414, + "balance_loss_mlp": 1.01684487, + "epoch": 0.21511453135333372, + "flos": 23471573293440.0, + "grad_norm": 2.041857349656563, + "language_loss": 0.81488442, + "learning_rate": 3.65132742386269e-06, + "loss": 0.83695722, + "num_input_tokens_seen": 38098245, + "step": 1789, + "time_per_iteration": 2.5078086853027344 + }, + { + "auxiliary_loss_clip": 0.01214362, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.05942988, + "balance_loss_mlp": 1.02215934, + "epoch": 0.21523477424397283, + "flos": 26943560893440.0, + "grad_norm": 1.662498619104773, + "language_loss": 0.84748101, + "learning_rate": 3.6508878319544656e-06, + "loss": 0.86995089, + "num_input_tokens_seen": 38118460, + "step": 1790, + "time_per_iteration": 2.4993860721588135 + }, + { + "auxiliary_loss_clip": 0.0117633, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_clip": 1.05865979, + "balance_loss_mlp": 1.03489256, + "epoch": 0.21535501713461191, + "flos": 18916161719040.0, + "grad_norm": 2.563312368127213, + "language_loss": 0.81922615, + "learning_rate": 3.65044798960787e-06, + "loss": 0.84143949, + "num_input_tokens_seen": 38136800, + "step": 1791, + "time_per_iteration": 2.5290913581848145 + }, + { + "auxiliary_loss_clip": 0.01164793, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.05520153, + "balance_loss_mlp": 1.02253819, + "epoch": 0.215475260025251, + "flos": 17895113712000.0, + "grad_norm": 1.6737757317399367, + "language_loss": 0.78327453, + "learning_rate": 3.650007896889627e-06, + "loss": 0.80524218, + "num_input_tokens_seen": 38155380, + "step": 1792, + "time_per_iteration": 2.5355114936828613 + }, + { + "auxiliary_loss_clip": 0.01214479, + "auxiliary_loss_mlp": 0.01039045, + "balance_loss_clip": 1.06281829, + "balance_loss_mlp": 1.02914512, + "epoch": 0.2155955029158901, + "flos": 16654292340480.0, + "grad_norm": 2.1883727892734024, + "language_loss": 0.80680132, + "learning_rate": 3.6495675538664974e-06, + "loss": 0.82933658, + "num_input_tokens_seen": 38174395, + "step": 1793, + "time_per_iteration": 2.457888126373291 + }, + { + "auxiliary_loss_clip": 0.01185336, + "auxiliary_loss_mlp": 0.01034756, + "balance_loss_clip": 1.05595183, + "balance_loss_mlp": 1.02495694, + "epoch": 0.2157157458065292, + "flos": 23621213352960.0, + "grad_norm": 1.7575446136644943, + "language_loss": 0.82871157, + "learning_rate": 3.649126960605282e-06, + "loss": 0.85091251, + "num_input_tokens_seen": 38195380, + "step": 1794, + "time_per_iteration": 2.544405221939087 + }, + { + "auxiliary_loss_clip": 0.01182188, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.05798399, + "balance_loss_mlp": 1.02397752, + "epoch": 0.21583598869716827, + "flos": 22127078292480.0, + "grad_norm": 2.3241727635455027, + "language_loss": 0.83531022, + "learning_rate": 3.6486861171728174e-06, + "loss": 0.85746992, + "num_input_tokens_seen": 38213775, + "step": 1795, + "time_per_iteration": 2.516519546508789 + }, + { + "auxiliary_loss_clip": 0.01167489, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.05154955, + "balance_loss_mlp": 1.01995194, + "epoch": 0.21595623158780738, + "flos": 23441229279360.0, + "grad_norm": 1.6675853419057018, + "language_loss": 0.7864151, + "learning_rate": 3.6482450236359803e-06, + "loss": 0.80838692, + "num_input_tokens_seen": 38235630, + "step": 1796, + "time_per_iteration": 2.6114914417266846 + }, + { + "auxiliary_loss_clip": 0.01198251, + "auxiliary_loss_mlp": 0.01039043, + "balance_loss_clip": 1.06090307, + "balance_loss_mlp": 1.02982223, + "epoch": 0.21607647447844647, + "flos": 26906501036160.0, + "grad_norm": 2.642848341706185, + "language_loss": 0.77897894, + "learning_rate": 3.647803680061683e-06, + "loss": 0.80135179, + "num_input_tokens_seen": 38256045, + "step": 1797, + "time_per_iteration": 2.5336523056030273 + }, + { + "auxiliary_loss_clip": 0.01186294, + "auxiliary_loss_mlp": 0.01036353, + "balance_loss_clip": 1.05849743, + "balance_loss_mlp": 1.02542746, + "epoch": 0.21619671736908555, + "flos": 14495378319360.0, + "grad_norm": 2.4854600263610647, + "language_loss": 0.74615562, + "learning_rate": 3.6473620865168776e-06, + "loss": 0.76838207, + "num_input_tokens_seen": 38272915, + "step": 1798, + "time_per_iteration": 2.494514226913452 + }, + { + "auxiliary_loss_clip": 0.01184594, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.0608511, + "balance_loss_mlp": 1.02644014, + "epoch": 0.21631696025972463, + "flos": 17931096161280.0, + "grad_norm": 1.9309158295459796, + "language_loss": 0.81498837, + "learning_rate": 3.646920243068554e-06, + "loss": 0.83718991, + "num_input_tokens_seen": 38290810, + "step": 1799, + "time_per_iteration": 2.5032858848571777 + }, + { + "auxiliary_loss_clip": 0.01167867, + "auxiliary_loss_mlp": 0.01034563, + "balance_loss_clip": 1.05375743, + "balance_loss_mlp": 1.02534819, + "epoch": 0.21643720315036374, + "flos": 24462385027200.0, + "grad_norm": 1.5824022339318022, + "language_loss": 0.74510413, + "learning_rate": 3.6464781497837384e-06, + "loss": 0.76712847, + "num_input_tokens_seen": 38312785, + "step": 1800, + "time_per_iteration": 2.5370969772338867 + }, + { + "auxiliary_loss_clip": 0.0118437, + "auxiliary_loss_mlp": 0.01045752, + "balance_loss_clip": 1.05401719, + "balance_loss_mlp": 1.03638232, + "epoch": 0.21655744604100283, + "flos": 28474432588800.0, + "grad_norm": 1.8600297324915298, + "language_loss": 0.72813314, + "learning_rate": 3.6460358067294965e-06, + "loss": 0.75043434, + "num_input_tokens_seen": 38334015, + "step": 1801, + "time_per_iteration": 2.576519727706909 + }, + { + "auxiliary_loss_clip": 0.01216647, + "auxiliary_loss_mlp": 0.01030826, + "balance_loss_clip": 1.06006932, + "balance_loss_mlp": 1.02083635, + "epoch": 0.2166776889316419, + "flos": 20152960767360.0, + "grad_norm": 1.955194711043629, + "language_loss": 0.78240514, + "learning_rate": 3.645593213972932e-06, + "loss": 0.80487984, + "num_input_tokens_seen": 38352920, + "step": 1802, + "time_per_iteration": 2.4722437858581543 + }, + { + "auxiliary_loss_clip": 0.01194934, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.05827296, + "balance_loss_mlp": 1.02185571, + "epoch": 0.21679793182228102, + "flos": 15193482122880.0, + "grad_norm": 2.605012971886413, + "language_loss": 0.79298192, + "learning_rate": 3.6451503715811852e-06, + "loss": 0.81525314, + "num_input_tokens_seen": 38371230, + "step": 1803, + "time_per_iteration": 2.4734320640563965 + }, + { + "auxiliary_loss_clip": 0.01182008, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.05973446, + "balance_loss_mlp": 1.0253222, + "epoch": 0.2169181747129201, + "flos": 17384464010880.0, + "grad_norm": 1.9668271057411717, + "language_loss": 0.80384636, + "learning_rate": 3.6447072796214345e-06, + "loss": 0.82600009, + "num_input_tokens_seen": 38389795, + "step": 1804, + "time_per_iteration": 2.4981882572174072 + }, + { + "auxiliary_loss_clip": 0.01065626, + "auxiliary_loss_mlp": 0.01005924, + "balance_loss_clip": 1.02624571, + "balance_loss_mlp": 1.00354028, + "epoch": 0.21703841760355919, + "flos": 58760955429120.0, + "grad_norm": 0.9554364486366573, + "language_loss": 0.63303971, + "learning_rate": 3.644263938160898e-06, + "loss": 0.65375525, + "num_input_tokens_seen": 38445760, + "step": 1805, + "time_per_iteration": 3.0463926792144775 + }, + { + "auxiliary_loss_clip": 0.01166202, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.05590272, + "balance_loss_mlp": 1.02158642, + "epoch": 0.21715866049419827, + "flos": 22418457419520.0, + "grad_norm": 1.8374340320280655, + "language_loss": 0.72122657, + "learning_rate": 3.6438203472668293e-06, + "loss": 0.74320513, + "num_input_tokens_seen": 38465405, + "step": 1806, + "time_per_iteration": 3.380117177963257 + }, + { + "auxiliary_loss_clip": 0.01186358, + "auxiliary_loss_mlp": 0.01034154, + "balance_loss_clip": 1.05735385, + "balance_loss_mlp": 1.0243547, + "epoch": 0.21727890338483738, + "flos": 17237732952960.0, + "grad_norm": 1.9070799122138213, + "language_loss": 0.82056355, + "learning_rate": 3.6433765070065206e-06, + "loss": 0.84276867, + "num_input_tokens_seen": 38483195, + "step": 1807, + "time_per_iteration": 2.499148368835449 + }, + { + "auxiliary_loss_clip": 0.01213145, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.05952454, + "balance_loss_mlp": 1.02148652, + "epoch": 0.21739914627547646, + "flos": 13434792416640.0, + "grad_norm": 2.5394212463227146, + "language_loss": 0.87596506, + "learning_rate": 3.6429324174473025e-06, + "loss": 0.8984127, + "num_input_tokens_seen": 38496735, + "step": 1808, + "time_per_iteration": 2.4230000972747803 + }, + { + "auxiliary_loss_clip": 0.01199368, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.05679536, + "balance_loss_mlp": 1.02656329, + "epoch": 0.21751938916611555, + "flos": 20959514709120.0, + "grad_norm": 1.9673286826152745, + "language_loss": 0.84808159, + "learning_rate": 3.6424880786565425e-06, + "loss": 0.87043512, + "num_input_tokens_seen": 38512880, + "step": 1809, + "time_per_iteration": 2.478933095932007 + }, + { + "auxiliary_loss_clip": 0.01152118, + "auxiliary_loss_mlp": 0.01037819, + "balance_loss_clip": 1.05597627, + "balance_loss_mlp": 1.0272212, + "epoch": 0.21763963205675466, + "flos": 27599936071680.0, + "grad_norm": 2.2724983028377452, + "language_loss": 0.79833269, + "learning_rate": 3.6420434907016482e-06, + "loss": 0.82023209, + "num_input_tokens_seen": 38532570, + "step": 1810, + "time_per_iteration": 3.4623312950134277 + }, + { + "auxiliary_loss_clip": 0.01200193, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.06220508, + "balance_loss_mlp": 1.02466893, + "epoch": 0.21775987494739374, + "flos": 21430411032960.0, + "grad_norm": 1.5040179017112, + "language_loss": 0.8115114, + "learning_rate": 3.6415986536500606e-06, + "loss": 0.83385295, + "num_input_tokens_seen": 38550900, + "step": 1811, + "time_per_iteration": 2.485686779022217 + }, + { + "auxiliary_loss_clip": 0.01147597, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.05770361, + "balance_loss_mlp": 1.02853298, + "epoch": 0.21788011783803282, + "flos": 18332972501760.0, + "grad_norm": 6.0255268028168105, + "language_loss": 0.80824983, + "learning_rate": 3.641153567569263e-06, + "loss": 0.83011007, + "num_input_tokens_seen": 38569215, + "step": 1812, + "time_per_iteration": 3.4030940532684326 + }, + { + "auxiliary_loss_clip": 0.01194622, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.05840492, + "balance_loss_mlp": 1.01991379, + "epoch": 0.2180003607286719, + "flos": 30262748037120.0, + "grad_norm": 2.333828781563822, + "language_loss": 0.95938945, + "learning_rate": 3.640708232526774e-06, + "loss": 0.9816286, + "num_input_tokens_seen": 38587870, + "step": 1813, + "time_per_iteration": 3.4120779037475586 + }, + { + "auxiliary_loss_clip": 0.01132549, + "auxiliary_loss_mlp": 0.0103344, + "balance_loss_clip": 1.04485655, + "balance_loss_mlp": 1.02314019, + "epoch": 0.21812060361931102, + "flos": 25480272637440.0, + "grad_norm": 4.50554017108526, + "language_loss": 0.78354979, + "learning_rate": 3.6402626485901504e-06, + "loss": 0.80520976, + "num_input_tokens_seen": 38606965, + "step": 1814, + "time_per_iteration": 2.6621203422546387 + }, + { + "auxiliary_loss_clip": 0.01194347, + "auxiliary_loss_mlp": 0.0103815, + "balance_loss_clip": 1.06085503, + "balance_loss_mlp": 1.02885103, + "epoch": 0.2182408465099501, + "flos": 21908166854400.0, + "grad_norm": 1.935069136220931, + "language_loss": 0.77921939, + "learning_rate": 3.639816815826988e-06, + "loss": 0.80154437, + "num_input_tokens_seen": 38626290, + "step": 1815, + "time_per_iteration": 2.5036466121673584 + }, + { + "auxiliary_loss_clip": 0.01181054, + "auxiliary_loss_mlp": 0.01028961, + "balance_loss_clip": 1.0582943, + "balance_loss_mlp": 1.01971006, + "epoch": 0.21836108940058918, + "flos": 23657339456640.0, + "grad_norm": 1.7389176893223532, + "language_loss": 0.77971947, + "learning_rate": 3.6393707343049176e-06, + "loss": 0.80181956, + "num_input_tokens_seen": 38646620, + "step": 1816, + "time_per_iteration": 2.55485463142395 + }, + { + "auxiliary_loss_clip": 0.01199444, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.0571363, + "balance_loss_mlp": 1.02026582, + "epoch": 0.2184813322912283, + "flos": 24681009156480.0, + "grad_norm": 2.750800225248675, + "language_loss": 0.73751456, + "learning_rate": 3.6389244040916104e-06, + "loss": 0.75980377, + "num_input_tokens_seen": 38665695, + "step": 1817, + "time_per_iteration": 2.543424606323242 + }, + { + "auxiliary_loss_clip": 0.01175243, + "auxiliary_loss_mlp": 0.00765681, + "balance_loss_clip": 1.0555681, + "balance_loss_mlp": 1.00089371, + "epoch": 0.21860157518186737, + "flos": 26574650259840.0, + "grad_norm": 1.9858860385453883, + "language_loss": 0.79217559, + "learning_rate": 3.6384778252547747e-06, + "loss": 0.81158483, + "num_input_tokens_seen": 38681575, + "step": 1818, + "time_per_iteration": 2.546870231628418 + }, + { + "auxiliary_loss_clip": 0.01179903, + "auxiliary_loss_mlp": 0.00764651, + "balance_loss_clip": 1.06010067, + "balance_loss_mlp": 1.0009644, + "epoch": 0.21872181807250646, + "flos": 20886292834560.0, + "grad_norm": 2.633498003300965, + "language_loss": 0.78427494, + "learning_rate": 3.638030997862155e-06, + "loss": 0.80372047, + "num_input_tokens_seen": 38700510, + "step": 1819, + "time_per_iteration": 2.5029172897338867 + }, + { + "auxiliary_loss_clip": 0.01089882, + "auxiliary_loss_mlp": 0.01008806, + "balance_loss_clip": 1.02916503, + "balance_loss_mlp": 1.00705338, + "epoch": 0.21884206096314554, + "flos": 61209452897280.0, + "grad_norm": 0.7692780539040929, + "language_loss": 0.59501928, + "learning_rate": 3.6375839219815356e-06, + "loss": 0.61600614, + "num_input_tokens_seen": 38758310, + "step": 1820, + "time_per_iteration": 3.018749475479126 + }, + { + "auxiliary_loss_clip": 0.0121308, + "auxiliary_loss_mlp": 0.01031777, + "balance_loss_clip": 1.06106544, + "balance_loss_mlp": 1.02185249, + "epoch": 0.21896230385378465, + "flos": 23473835850240.0, + "grad_norm": 1.9937184411250723, + "language_loss": 0.82753086, + "learning_rate": 3.6371365976807375e-06, + "loss": 0.8499794, + "num_input_tokens_seen": 38778705, + "step": 1821, + "time_per_iteration": 2.4649085998535156 + }, + { + "auxiliary_loss_clip": 0.0114489, + "auxiliary_loss_mlp": 0.01029548, + "balance_loss_clip": 1.05391288, + "balance_loss_mlp": 1.02007675, + "epoch": 0.21908254674442373, + "flos": 25081915829760.0, + "grad_norm": 1.700915811783782, + "language_loss": 0.84071362, + "learning_rate": 3.6366890250276185e-06, + "loss": 0.86245799, + "num_input_tokens_seen": 38799660, + "step": 1822, + "time_per_iteration": 2.6104955673217773 + }, + { + "auxiliary_loss_clip": 0.01212311, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.06049335, + "balance_loss_mlp": 1.02105069, + "epoch": 0.21920278963506282, + "flos": 23513768795520.0, + "grad_norm": 1.8820216773477807, + "language_loss": 0.89986455, + "learning_rate": 3.6362412040900764e-06, + "loss": 0.9222942, + "num_input_tokens_seen": 38819450, + "step": 1823, + "time_per_iteration": 2.475553274154663 + }, + { + "auxiliary_loss_clip": 0.01199665, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.05791688, + "balance_loss_mlp": 1.02274978, + "epoch": 0.21932303252570193, + "flos": 29242238734080.0, + "grad_norm": 2.0366168150927515, + "language_loss": 0.80926651, + "learning_rate": 3.635793134936044e-06, + "loss": 0.83158791, + "num_input_tokens_seen": 38840460, + "step": 1824, + "time_per_iteration": 2.543588161468506 + }, + { + "auxiliary_loss_clip": 0.01196323, + "auxiliary_loss_mlp": 0.01036387, + "balance_loss_clip": 1.05972886, + "balance_loss_mlp": 1.02686214, + "epoch": 0.219443275416341, + "flos": 20806857907200.0, + "grad_norm": 2.055728139162839, + "language_loss": 0.73196483, + "learning_rate": 3.635344817633494e-06, + "loss": 0.75429189, + "num_input_tokens_seen": 38859775, + "step": 1825, + "time_per_iteration": 2.595139265060425 + }, + { + "auxiliary_loss_clip": 0.01192514, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.05707002, + "balance_loss_mlp": 1.02314222, + "epoch": 0.2195635183069801, + "flos": 14501555458560.0, + "grad_norm": 2.067874691579874, + "language_loss": 0.75541836, + "learning_rate": 3.634896252250436e-06, + "loss": 0.77767074, + "num_input_tokens_seen": 38876540, + "step": 1826, + "time_per_iteration": 2.4944369792938232 + }, + { + "auxiliary_loss_clip": 0.01217211, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_clip": 1.06392622, + "balance_loss_mlp": 1.03305995, + "epoch": 0.2196837611976192, + "flos": 24243473589120.0, + "grad_norm": 1.9677866078938475, + "language_loss": 0.82004106, + "learning_rate": 3.6344474388549157e-06, + "loss": 0.84263921, + "num_input_tokens_seen": 38896195, + "step": 1827, + "time_per_iteration": 2.4724934101104736 + }, + { + "auxiliary_loss_clip": 0.01203914, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.06372714, + "balance_loss_mlp": 1.0298661, + "epoch": 0.2198040040882583, + "flos": 18074523168000.0, + "grad_norm": 2.095655207348312, + "language_loss": 0.80106449, + "learning_rate": 3.6339983775150183e-06, + "loss": 0.82350862, + "num_input_tokens_seen": 38912755, + "step": 1828, + "time_per_iteration": 2.479191780090332 + }, + { + "auxiliary_loss_clip": 0.01195874, + "auxiliary_loss_mlp": 0.01030235, + "balance_loss_clip": 1.06006789, + "balance_loss_mlp": 1.02009618, + "epoch": 0.21992424697889737, + "flos": 17784185535360.0, + "grad_norm": 2.5605731210981166, + "language_loss": 0.83987868, + "learning_rate": 3.6335490682988664e-06, + "loss": 0.86213982, + "num_input_tokens_seen": 38928365, + "step": 1829, + "time_per_iteration": 2.4454495906829834 + }, + { + "auxiliary_loss_clip": 0.01130761, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.04988146, + "balance_loss_mlp": 1.02451229, + "epoch": 0.22004448986953645, + "flos": 17638495971840.0, + "grad_norm": 2.016747118862944, + "language_loss": 0.82983798, + "learning_rate": 3.63309951127462e-06, + "loss": 0.85148519, + "num_input_tokens_seen": 38945275, + "step": 1830, + "time_per_iteration": 2.604417324066162 + }, + { + "auxiliary_loss_clip": 0.0116918, + "auxiliary_loss_mlp": 0.01041439, + "balance_loss_clip": 1.05855632, + "balance_loss_mlp": 1.03133583, + "epoch": 0.22016473276017556, + "flos": 22275533203200.0, + "grad_norm": 2.07099195142222, + "language_loss": 0.75454307, + "learning_rate": 3.6326497065104757e-06, + "loss": 0.7766493, + "num_input_tokens_seen": 38965740, + "step": 1831, + "time_per_iteration": 2.576331377029419 + }, + { + "auxiliary_loss_clip": 0.01203776, + "auxiliary_loss_mlp": 0.01032998, + "balance_loss_clip": 1.06062341, + "balance_loss_mlp": 1.0234791, + "epoch": 0.22028497565081465, + "flos": 25556259859200.0, + "grad_norm": 1.962192022577612, + "language_loss": 0.77963573, + "learning_rate": 3.6321996540746697e-06, + "loss": 0.8020035, + "num_input_tokens_seen": 38984815, + "step": 1832, + "time_per_iteration": 3.343738555908203 + }, + { + "auxiliary_loss_clip": 0.01168782, + "auxiliary_loss_mlp": 0.01033061, + "balance_loss_clip": 1.05751157, + "balance_loss_mlp": 1.02288043, + "epoch": 0.22040521854145373, + "flos": 36247332925440.0, + "grad_norm": 1.7843214336478292, + "language_loss": 0.80798113, + "learning_rate": 3.6317493540354733e-06, + "loss": 0.82999957, + "num_input_tokens_seen": 39008230, + "step": 1833, + "time_per_iteration": 2.691223621368408 + }, + { + "auxiliary_loss_clip": 0.01196108, + "auxiliary_loss_mlp": 0.01038222, + "balance_loss_clip": 1.05848503, + "balance_loss_mlp": 1.02853072, + "epoch": 0.22052546143209284, + "flos": 11838420270720.0, + "grad_norm": 2.1075454989375526, + "language_loss": 0.76603472, + "learning_rate": 3.6312988064611976e-06, + "loss": 0.788378, + "num_input_tokens_seen": 39026540, + "step": 1834, + "time_per_iteration": 2.503697395324707 + }, + { + "auxiliary_loss_clip": 0.01170335, + "auxiliary_loss_mlp": 0.01037744, + "balance_loss_clip": 1.05327177, + "balance_loss_mlp": 1.02775407, + "epoch": 0.22064570432273192, + "flos": 24209250906240.0, + "grad_norm": 1.7980106022524054, + "language_loss": 0.81579375, + "learning_rate": 3.6308480114201896e-06, + "loss": 0.83787453, + "num_input_tokens_seen": 39048460, + "step": 1835, + "time_per_iteration": 2.586886167526245 + }, + { + "auxiliary_loss_clip": 0.01218161, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.06628919, + "balance_loss_mlp": 1.02968121, + "epoch": 0.220765947213371, + "flos": 17931347556480.0, + "grad_norm": 1.9866853062646816, + "language_loss": 0.76503837, + "learning_rate": 3.630396968980835e-06, + "loss": 0.78761774, + "num_input_tokens_seen": 39066335, + "step": 1836, + "time_per_iteration": 2.4501953125 + }, + { + "auxiliary_loss_clip": 0.0118653, + "auxiliary_loss_mlp": 0.01036977, + "balance_loss_clip": 1.05829692, + "balance_loss_mlp": 1.02668285, + "epoch": 0.2208861901040101, + "flos": 26757040544640.0, + "grad_norm": 2.4257323987347736, + "language_loss": 0.83769405, + "learning_rate": 3.6299456792115575e-06, + "loss": 0.85992903, + "num_input_tokens_seen": 39087590, + "step": 1837, + "time_per_iteration": 3.375005006790161 + }, + { + "auxiliary_loss_clip": 0.01110082, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.0461539, + "balance_loss_mlp": 1.02144611, + "epoch": 0.2210064329946492, + "flos": 17817977255040.0, + "grad_norm": 1.867138953688265, + "language_loss": 0.81080282, + "learning_rate": 3.629494142180815e-06, + "loss": 0.83221638, + "num_input_tokens_seen": 39106335, + "step": 1838, + "time_per_iteration": 3.5625452995300293 + }, + { + "auxiliary_loss_clip": 0.01214356, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.06248856, + "balance_loss_mlp": 1.02335024, + "epoch": 0.22112667588528828, + "flos": 17967401832960.0, + "grad_norm": 2.4137200429991235, + "language_loss": 0.8487736, + "learning_rate": 3.6290423579571075e-06, + "loss": 0.87124902, + "num_input_tokens_seen": 39122875, + "step": 1839, + "time_per_iteration": 2.429676055908203 + }, + { + "auxiliary_loss_clip": 0.0119767, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.06198418, + "balance_loss_mlp": 1.02320123, + "epoch": 0.22124691877592736, + "flos": 18369206346240.0, + "grad_norm": 1.5888362504762383, + "language_loss": 0.80414015, + "learning_rate": 3.6285903266089694e-06, + "loss": 0.8264513, + "num_input_tokens_seen": 39142150, + "step": 1840, + "time_per_iteration": 3.2666099071502686 + }, + { + "auxiliary_loss_clip": 0.01188929, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.06119275, + "balance_loss_mlp": 1.02119398, + "epoch": 0.22136716166656648, + "flos": 20813286441600.0, + "grad_norm": 2.277933352628288, + "language_loss": 0.77550155, + "learning_rate": 3.628138048204974e-06, + "loss": 0.79770124, + "num_input_tokens_seen": 39162835, + "step": 1841, + "time_per_iteration": 2.5117993354797363 + }, + { + "auxiliary_loss_clip": 0.01147157, + "auxiliary_loss_mlp": 0.01035456, + "balance_loss_clip": 1.05471373, + "balance_loss_mlp": 1.02466178, + "epoch": 0.22148740455720556, + "flos": 17675699483520.0, + "grad_norm": 1.7526688298215332, + "language_loss": 0.75941789, + "learning_rate": 3.6276855228137304e-06, + "loss": 0.78124404, + "num_input_tokens_seen": 39181040, + "step": 1842, + "time_per_iteration": 2.5626485347747803 + }, + { + "auxiliary_loss_clip": 0.01215864, + "auxiliary_loss_mlp": 0.00765363, + "balance_loss_clip": 1.06377769, + "balance_loss_mlp": 1.0009048, + "epoch": 0.22160764744784464, + "flos": 21726710323200.0, + "grad_norm": 2.1723645940308165, + "language_loss": 0.81869686, + "learning_rate": 3.6272327505038874e-06, + "loss": 0.83850914, + "num_input_tokens_seen": 39197505, + "step": 1843, + "time_per_iteration": 2.474837303161621 + }, + { + "auxiliary_loss_clip": 0.01158895, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.0536046, + "balance_loss_mlp": 1.02705932, + "epoch": 0.22172789033848372, + "flos": 23764712186880.0, + "grad_norm": 2.0657982927518783, + "language_loss": 0.78608894, + "learning_rate": 3.626779731344131e-06, + "loss": 0.80804026, + "num_input_tokens_seen": 39217295, + "step": 1844, + "time_per_iteration": 2.614419460296631 + }, + { + "auxiliary_loss_clip": 0.01209412, + "auxiliary_loss_mlp": 0.01039411, + "balance_loss_clip": 1.06075335, + "balance_loss_mlp": 1.03011298, + "epoch": 0.22184813322912283, + "flos": 16982300361600.0, + "grad_norm": 1.8979379168343127, + "language_loss": 0.85162294, + "learning_rate": 3.6263264654031814e-06, + "loss": 0.87411118, + "num_input_tokens_seen": 39234195, + "step": 1845, + "time_per_iteration": 2.429943323135376 + }, + { + "auxiliary_loss_clip": 0.0107815, + "auxiliary_loss_mlp": 0.01002226, + "balance_loss_clip": 1.02922654, + "balance_loss_mlp": 1.00043821, + "epoch": 0.22196837611976192, + "flos": 61823740314240.0, + "grad_norm": 0.7080897769965405, + "language_loss": 0.59222591, + "learning_rate": 3.6258729527498008e-06, + "loss": 0.61302972, + "num_input_tokens_seen": 39295040, + "step": 1846, + "time_per_iteration": 3.091163396835327 + }, + { + "auxiliary_loss_clip": 0.01191191, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.06260633, + "balance_loss_mlp": 1.02148342, + "epoch": 0.222088619010401, + "flos": 25558019625600.0, + "grad_norm": 2.234913187081375, + "language_loss": 0.64814746, + "learning_rate": 3.6254191934527854e-06, + "loss": 0.6703698, + "num_input_tokens_seen": 39314395, + "step": 1847, + "time_per_iteration": 2.6546897888183594 + }, + { + "auxiliary_loss_clip": 0.0116668, + "auxiliary_loss_mlp": 0.0103203, + "balance_loss_clip": 1.05999601, + "balance_loss_mlp": 1.02168822, + "epoch": 0.2222088619010401, + "flos": 19318612677120.0, + "grad_norm": 1.9880423692090115, + "language_loss": 0.65021938, + "learning_rate": 3.6249651875809715e-06, + "loss": 0.67220646, + "num_input_tokens_seen": 39334275, + "step": 1848, + "time_per_iteration": 2.595595598220825 + }, + { + "auxiliary_loss_clip": 0.01179336, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.06053615, + "balance_loss_mlp": 1.02221775, + "epoch": 0.2223291047916792, + "flos": 19099342103040.0, + "grad_norm": 1.8900897185309564, + "language_loss": 0.89523858, + "learning_rate": 3.62451093520323e-06, + "loss": 0.91734886, + "num_input_tokens_seen": 39352180, + "step": 1849, + "time_per_iteration": 2.5012753009796143 + }, + { + "auxiliary_loss_clip": 0.01151103, + "auxiliary_loss_mlp": 0.01043861, + "balance_loss_clip": 1.0532589, + "balance_loss_mlp": 1.0345273, + "epoch": 0.22244934768231828, + "flos": 20850418126080.0, + "grad_norm": 2.192864299218851, + "language_loss": 0.90520245, + "learning_rate": 3.6240564363884714e-06, + "loss": 0.9271521, + "num_input_tokens_seen": 39372125, + "step": 1850, + "time_per_iteration": 2.60660982131958 + }, + { + "auxiliary_loss_clip": 0.01201212, + "auxiliary_loss_mlp": 0.01038678, + "balance_loss_clip": 1.05873311, + "balance_loss_mlp": 1.02897382, + "epoch": 0.2225695905729574, + "flos": 15632921111040.0, + "grad_norm": 2.1501431983204053, + "language_loss": 0.70448303, + "learning_rate": 3.623601691205643e-06, + "loss": 0.72688192, + "num_input_tokens_seen": 39391200, + "step": 1851, + "time_per_iteration": 2.502091646194458 + }, + { + "auxiliary_loss_clip": 0.01196985, + "auxiliary_loss_mlp": 0.01029312, + "balance_loss_clip": 1.05827117, + "balance_loss_mlp": 1.02013314, + "epoch": 0.22268983346359647, + "flos": 25373582265600.0, + "grad_norm": 2.0433813144031587, + "language_loss": 0.81309092, + "learning_rate": 3.623146699723729e-06, + "loss": 0.83535391, + "num_input_tokens_seen": 39410660, + "step": 1852, + "time_per_iteration": 2.553537368774414 + }, + { + "auxiliary_loss_clip": 0.01186827, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.06475699, + "balance_loss_mlp": 1.02832437, + "epoch": 0.22281007635423555, + "flos": 13261452359040.0, + "grad_norm": 1.7434932059917718, + "language_loss": 0.77765834, + "learning_rate": 3.6226914620117507e-06, + "loss": 0.7999056, + "num_input_tokens_seen": 39429280, + "step": 1853, + "time_per_iteration": 2.5166845321655273 + }, + { + "auxiliary_loss_clip": 0.01171565, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.05404162, + "balance_loss_mlp": 1.02259493, + "epoch": 0.22293031924487464, + "flos": 15340536403200.0, + "grad_norm": 2.448947588991763, + "language_loss": 0.81192482, + "learning_rate": 3.622235978138768e-06, + "loss": 0.8339591, + "num_input_tokens_seen": 39446905, + "step": 1854, + "time_per_iteration": 2.531911849975586 + }, + { + "auxiliary_loss_clip": 0.01200147, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.06383705, + "balance_loss_mlp": 1.02135217, + "epoch": 0.22305056213551375, + "flos": 22564649773440.0, + "grad_norm": 1.8666837020135476, + "language_loss": 0.81472456, + "learning_rate": 3.621780248173877e-06, + "loss": 0.83703429, + "num_input_tokens_seen": 39465105, + "step": 1855, + "time_per_iteration": 2.485137701034546 + }, + { + "auxiliary_loss_clip": 0.01105582, + "auxiliary_loss_mlp": 0.01004052, + "balance_loss_clip": 1.028512, + "balance_loss_mlp": 1.00253248, + "epoch": 0.22317080502615283, + "flos": 64880419887360.0, + "grad_norm": 0.8282101929430018, + "language_loss": 0.61029661, + "learning_rate": 3.6213242721862125e-06, + "loss": 0.63139296, + "num_input_tokens_seen": 39523560, + "step": 1856, + "time_per_iteration": 3.068829298019409 + }, + { + "auxiliary_loss_clip": 0.01173778, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.0582335, + "balance_loss_mlp": 1.02625108, + "epoch": 0.2232910479167919, + "flos": 25775997310080.0, + "grad_norm": 1.588392971160899, + "language_loss": 0.75256741, + "learning_rate": 3.620868050244945e-06, + "loss": 0.77466452, + "num_input_tokens_seen": 39544040, + "step": 1857, + "time_per_iteration": 2.5508782863616943 + }, + { + "auxiliary_loss_clip": 0.01178659, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.05777502, + "balance_loss_mlp": 1.02074552, + "epoch": 0.22341129080743102, + "flos": 23251799928960.0, + "grad_norm": 1.7773498125523781, + "language_loss": 0.7767967, + "learning_rate": 3.6204115824192817e-06, + "loss": 0.79888856, + "num_input_tokens_seen": 39561515, + "step": 1858, + "time_per_iteration": 2.5358359813690186 + }, + { + "auxiliary_loss_clip": 0.01173929, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.05437028, + "balance_loss_mlp": 1.02241302, + "epoch": 0.2235315336980701, + "flos": 21214552250880.0, + "grad_norm": 2.57898043621557, + "language_loss": 0.76530206, + "learning_rate": 3.619954868778471e-06, + "loss": 0.78736651, + "num_input_tokens_seen": 39578210, + "step": 1859, + "time_per_iteration": 3.29205584526062 + }, + { + "auxiliary_loss_clip": 0.01183414, + "auxiliary_loss_mlp": 0.01034268, + "balance_loss_clip": 1.05726385, + "balance_loss_mlp": 1.02543473, + "epoch": 0.2236517765887092, + "flos": 19901945548800.0, + "grad_norm": 1.922907339410487, + "language_loss": 0.82973999, + "learning_rate": 3.6194979093917944e-06, + "loss": 0.85191673, + "num_input_tokens_seen": 39597625, + "step": 1860, + "time_per_iteration": 2.516542673110962 + }, + { + "auxiliary_loss_clip": 0.01179317, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.05759752, + "balance_loss_mlp": 1.02783465, + "epoch": 0.22377201947934827, + "flos": 23214847812480.0, + "grad_norm": 1.8880782064336612, + "language_loss": 0.87007153, + "learning_rate": 3.6190407043285724e-06, + "loss": 0.89223617, + "num_input_tokens_seen": 39615360, + "step": 1861, + "time_per_iteration": 2.525892734527588 + }, + { + "auxiliary_loss_clip": 0.01216472, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.06313014, + "balance_loss_mlp": 1.02469587, + "epoch": 0.22389226236998738, + "flos": 26794244056320.0, + "grad_norm": 1.8270058990913984, + "language_loss": 0.75750405, + "learning_rate": 3.618583253658163e-06, + "loss": 0.78001392, + "num_input_tokens_seen": 39635460, + "step": 1862, + "time_per_iteration": 2.4903433322906494 + }, + { + "auxiliary_loss_clip": 0.0115348, + "auxiliary_loss_mlp": 0.0076543, + "balance_loss_clip": 1.05478442, + "balance_loss_mlp": 1.00098884, + "epoch": 0.22401250526062647, + "flos": 24170359455360.0, + "grad_norm": 2.0531138651682936, + "language_loss": 0.86383408, + "learning_rate": 3.618125557449961e-06, + "loss": 0.88302314, + "num_input_tokens_seen": 39653515, + "step": 1863, + "time_per_iteration": 2.628727436065674 + }, + { + "auxiliary_loss_clip": 0.01195902, + "auxiliary_loss_mlp": 0.01032405, + "balance_loss_clip": 1.06027055, + "balance_loss_mlp": 1.02274323, + "epoch": 0.22413274815126555, + "flos": 16759761649920.0, + "grad_norm": 1.9826821231827214, + "language_loss": 0.83092076, + "learning_rate": 3.6176676157733983e-06, + "loss": 0.85320383, + "num_input_tokens_seen": 39668525, + "step": 1864, + "time_per_iteration": 3.395414113998413 + }, + { + "auxiliary_loss_clip": 0.01163012, + "auxiliary_loss_mlp": 0.0103444, + "balance_loss_clip": 1.0552299, + "balance_loss_mlp": 1.02475476, + "epoch": 0.22425299104190466, + "flos": 21360205900800.0, + "grad_norm": 1.9861608042584011, + "language_loss": 0.75771934, + "learning_rate": 3.6172094286979443e-06, + "loss": 0.77969384, + "num_input_tokens_seen": 39685895, + "step": 1865, + "time_per_iteration": 2.538931131362915 + }, + { + "auxiliary_loss_clip": 0.01180527, + "auxiliary_loss_mlp": 0.01033329, + "balance_loss_clip": 1.05447745, + "balance_loss_mlp": 1.02385747, + "epoch": 0.22437323393254374, + "flos": 32165547108480.0, + "grad_norm": 1.4542573198902833, + "language_loss": 0.81347823, + "learning_rate": 3.6167509962931064e-06, + "loss": 0.83561677, + "num_input_tokens_seen": 39711595, + "step": 1866, + "time_per_iteration": 3.361229181289673 + }, + { + "auxiliary_loss_clip": 0.011636, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.05907941, + "balance_loss_mlp": 1.02331305, + "epoch": 0.22449347682318282, + "flos": 18002809664640.0, + "grad_norm": 2.425195390624701, + "language_loss": 0.77168375, + "learning_rate": 3.6162923186284276e-06, + "loss": 0.79365057, + "num_input_tokens_seen": 39727555, + "step": 1867, + "time_per_iteration": 2.542787790298462 + }, + { + "auxiliary_loss_clip": 0.01184134, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.05755353, + "balance_loss_mlp": 1.02940118, + "epoch": 0.2246137197138219, + "flos": 18697286194560.0, + "grad_norm": 1.871388549069376, + "language_loss": 0.85816681, + "learning_rate": 3.6158333957734888e-06, + "loss": 0.88039809, + "num_input_tokens_seen": 39746145, + "step": 1868, + "time_per_iteration": 2.494091272354126 + }, + { + "auxiliary_loss_clip": 0.01169593, + "auxiliary_loss_mlp": 0.01033391, + "balance_loss_clip": 1.0536294, + "balance_loss_mlp": 1.02390838, + "epoch": 0.22473396260446102, + "flos": 15590653781760.0, + "grad_norm": 2.5066638802733663, + "language_loss": 0.82652211, + "learning_rate": 3.6153742277979088e-06, + "loss": 0.84855193, + "num_input_tokens_seen": 39763575, + "step": 1869, + "time_per_iteration": 2.527780055999756 + }, + { + "auxiliary_loss_clip": 0.01186785, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.05839658, + "balance_loss_mlp": 1.02910352, + "epoch": 0.2248542054951001, + "flos": 14465501182080.0, + "grad_norm": 1.8966546830711624, + "language_loss": 0.78301626, + "learning_rate": 3.6149148147713434e-06, + "loss": 0.8052696, + "num_input_tokens_seen": 39781810, + "step": 1870, + "time_per_iteration": 2.492715358734131 + }, + { + "auxiliary_loss_clip": 0.01205259, + "auxiliary_loss_mlp": 0.01036571, + "balance_loss_clip": 1.06516886, + "balance_loss_mlp": 1.02749372, + "epoch": 0.22497444838573918, + "flos": 19243882431360.0, + "grad_norm": 1.948123388163964, + "language_loss": 0.86649078, + "learning_rate": 3.614455156763484e-06, + "loss": 0.8889091, + "num_input_tokens_seen": 39800115, + "step": 1871, + "time_per_iteration": 2.4802510738372803 + }, + { + "auxiliary_loss_clip": 0.01147589, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.0484823, + "balance_loss_mlp": 1.02514207, + "epoch": 0.2250946912763783, + "flos": 16910299549440.0, + "grad_norm": 1.9783770824194185, + "language_loss": 0.71753198, + "learning_rate": 3.613995253844061e-06, + "loss": 0.73935157, + "num_input_tokens_seen": 39817795, + "step": 1872, + "time_per_iteration": 2.5490310192108154 + }, + { + "auxiliary_loss_clip": 0.011987, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.06279373, + "balance_loss_mlp": 1.02771521, + "epoch": 0.22521493416701738, + "flos": 24681368292480.0, + "grad_norm": 1.8430744053092842, + "language_loss": 0.81225878, + "learning_rate": 3.6135351060828414e-06, + "loss": 0.83461916, + "num_input_tokens_seen": 39838270, + "step": 1873, + "time_per_iteration": 2.5368502140045166 + }, + { + "auxiliary_loss_clip": 0.01220939, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.06565356, + "balance_loss_mlp": 1.0313648, + "epoch": 0.22533517705765646, + "flos": 17821963664640.0, + "grad_norm": 2.4485399220804536, + "language_loss": 0.69037896, + "learning_rate": 3.6130747135496285e-06, + "loss": 0.71300411, + "num_input_tokens_seen": 39857270, + "step": 1874, + "time_per_iteration": 2.5364906787872314 + }, + { + "auxiliary_loss_clip": 0.0121332, + "auxiliary_loss_mlp": 0.01033476, + "balance_loss_clip": 1.06186152, + "balance_loss_mlp": 1.02355158, + "epoch": 0.22545541994829554, + "flos": 33691390899840.0, + "grad_norm": 1.912430356470123, + "language_loss": 0.65819371, + "learning_rate": 3.6126140763142646e-06, + "loss": 0.68066162, + "num_input_tokens_seen": 39882300, + "step": 1875, + "time_per_iteration": 2.5601909160614014 + }, + { + "auxiliary_loss_clip": 0.01215015, + "auxiliary_loss_mlp": 0.01039081, + "balance_loss_clip": 1.06363487, + "balance_loss_mlp": 1.02875769, + "epoch": 0.22557566283893465, + "flos": 19171594310400.0, + "grad_norm": 2.4741014409101187, + "language_loss": 0.85806561, + "learning_rate": 3.6121531944466275e-06, + "loss": 0.88060653, + "num_input_tokens_seen": 39899625, + "step": 1876, + "time_per_iteration": 2.439235210418701 + }, + { + "auxiliary_loss_clip": 0.01195966, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.06039047, + "balance_loss_mlp": 1.02843761, + "epoch": 0.22569590572957374, + "flos": 20773281669120.0, + "grad_norm": 2.0766529801959623, + "language_loss": 0.78550196, + "learning_rate": 3.611692068016633e-06, + "loss": 0.80783719, + "num_input_tokens_seen": 39915955, + "step": 1877, + "time_per_iteration": 2.5173697471618652 + }, + { + "auxiliary_loss_clip": 0.01160749, + "auxiliary_loss_mlp": 0.01040672, + "balance_loss_clip": 1.05057383, + "balance_loss_mlp": 1.03011644, + "epoch": 0.22581614862021282, + "flos": 18442715529600.0, + "grad_norm": 2.0774131567164624, + "language_loss": 0.74898779, + "learning_rate": 3.611230697094233e-06, + "loss": 0.77100205, + "num_input_tokens_seen": 39932655, + "step": 1878, + "time_per_iteration": 2.56384015083313 + }, + { + "auxiliary_loss_clip": 0.01185986, + "auxiliary_loss_mlp": 0.01037462, + "balance_loss_clip": 1.05694747, + "balance_loss_mlp": 1.02830684, + "epoch": 0.22593639151085193, + "flos": 20048389297920.0, + "grad_norm": 1.7982894101729847, + "language_loss": 0.86887002, + "learning_rate": 3.6107690817494173e-06, + "loss": 0.89110446, + "num_input_tokens_seen": 39952875, + "step": 1879, + "time_per_iteration": 2.598787307739258 + }, + { + "auxiliary_loss_clip": 0.01148995, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.05222559, + "balance_loss_mlp": 1.02327812, + "epoch": 0.226056634401491, + "flos": 13115116350720.0, + "grad_norm": 2.589123386647954, + "language_loss": 0.70996881, + "learning_rate": 3.6103072220522117e-06, + "loss": 0.73178852, + "num_input_tokens_seen": 39968405, + "step": 1880, + "time_per_iteration": 2.5793960094451904 + }, + { + "auxiliary_loss_clip": 0.01172021, + "auxiliary_loss_mlp": 0.010378, + "balance_loss_clip": 1.05532455, + "balance_loss_mlp": 1.02825689, + "epoch": 0.2261768772921301, + "flos": 18988378012800.0, + "grad_norm": 1.8618163450802068, + "language_loss": 0.91884744, + "learning_rate": 3.609845118072682e-06, + "loss": 0.94094574, + "num_input_tokens_seen": 39987075, + "step": 1881, + "time_per_iteration": 2.562549352645874 + }, + { + "auxiliary_loss_clip": 0.01204044, + "auxiliary_loss_mlp": 0.00765513, + "balance_loss_clip": 1.06053567, + "balance_loss_mlp": 1.00106442, + "epoch": 0.2262971201827692, + "flos": 19974054101760.0, + "grad_norm": 1.8250034757698619, + "language_loss": 0.79947442, + "learning_rate": 3.6093827698809276e-06, + "loss": 0.81917, + "num_input_tokens_seen": 40006175, + "step": 1882, + "time_per_iteration": 2.489743947982788 + }, + { + "auxiliary_loss_clip": 0.0119492, + "auxiliary_loss_mlp": 0.01037179, + "balance_loss_clip": 1.0564754, + "balance_loss_mlp": 1.02767777, + "epoch": 0.2264173630734083, + "flos": 16654543735680.0, + "grad_norm": 2.136312271798255, + "language_loss": 0.84983349, + "learning_rate": 3.6089201775470864e-06, + "loss": 0.87215436, + "num_input_tokens_seen": 40021630, + "step": 1883, + "time_per_iteration": 2.4470057487487793 + }, + { + "auxiliary_loss_clip": 0.01158182, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.05583787, + "balance_loss_mlp": 1.02687633, + "epoch": 0.22653760596404737, + "flos": 24389809597440.0, + "grad_norm": 1.3768009123846234, + "language_loss": 0.77395582, + "learning_rate": 3.6084573411413334e-06, + "loss": 0.79590416, + "num_input_tokens_seen": 40041025, + "step": 1884, + "time_per_iteration": 2.5785439014434814 + }, + { + "auxiliary_loss_clip": 0.01169279, + "auxiliary_loss_mlp": 0.01040842, + "balance_loss_clip": 1.05641735, + "balance_loss_mlp": 1.03001213, + "epoch": 0.22665784885468646, + "flos": 18332541538560.0, + "grad_norm": 1.9760189842238627, + "language_loss": 0.80994856, + "learning_rate": 3.607994260733881e-06, + "loss": 0.83204985, + "num_input_tokens_seen": 40060265, + "step": 1885, + "time_per_iteration": 3.3323817253112793 + }, + { + "auxiliary_loss_clip": 0.01187462, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.05670381, + "balance_loss_mlp": 1.02208531, + "epoch": 0.22677809174532557, + "flos": 24058102475520.0, + "grad_norm": 1.634145050080432, + "language_loss": 0.74713755, + "learning_rate": 3.6075309363949776e-06, + "loss": 0.76932299, + "num_input_tokens_seen": 40079435, + "step": 1886, + "time_per_iteration": 2.514387845993042 + }, + { + "auxiliary_loss_clip": 0.01213582, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.06082606, + "balance_loss_mlp": 1.02393341, + "epoch": 0.22689833463596465, + "flos": 20374242503040.0, + "grad_norm": 2.282287208111359, + "language_loss": 0.81096125, + "learning_rate": 3.6070673681949094e-06, + "loss": 0.83343434, + "num_input_tokens_seen": 40097800, + "step": 1887, + "time_per_iteration": 2.4556827545166016 + }, + { + "auxiliary_loss_clip": 0.01185518, + "auxiliary_loss_mlp": 0.00765134, + "balance_loss_clip": 1.0596745, + "balance_loss_mlp": 1.00110686, + "epoch": 0.22701857752660373, + "flos": 30120398438400.0, + "grad_norm": 1.8463247542872159, + "language_loss": 0.8146311, + "learning_rate": 3.606603556203999e-06, + "loss": 0.83413768, + "num_input_tokens_seen": 40122745, + "step": 1888, + "time_per_iteration": 2.601454973220825 + }, + { + "auxiliary_loss_clip": 0.01197506, + "auxiliary_loss_mlp": 0.0103553, + "balance_loss_clip": 1.05647182, + "balance_loss_mlp": 1.02602863, + "epoch": 0.22713882041724284, + "flos": 22492182084480.0, + "grad_norm": 1.811119368682901, + "language_loss": 0.83463424, + "learning_rate": 3.6061395004926066e-06, + "loss": 0.85696459, + "num_input_tokens_seen": 40141680, + "step": 1889, + "time_per_iteration": 2.480898141860962 + }, + { + "auxiliary_loss_clip": 0.01213199, + "auxiliary_loss_mlp": 0.01034164, + "balance_loss_clip": 1.0600574, + "balance_loss_mlp": 1.024544, + "epoch": 0.22725906330788193, + "flos": 20521548178560.0, + "grad_norm": 1.9479741659128826, + "language_loss": 0.84772599, + "learning_rate": 3.605675201131129e-06, + "loss": 0.87019956, + "num_input_tokens_seen": 40160140, + "step": 1890, + "time_per_iteration": 2.4434306621551514 + }, + { + "auxiliary_loss_clip": 0.01206992, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.06376541, + "balance_loss_mlp": 1.02487016, + "epoch": 0.227379306198521, + "flos": 18989922297600.0, + "grad_norm": 2.244446726712968, + "language_loss": 0.79414457, + "learning_rate": 3.60521065819e-06, + "loss": 0.81655288, + "num_input_tokens_seen": 40177450, + "step": 1891, + "time_per_iteration": 3.3893744945526123 + }, + { + "auxiliary_loss_clip": 0.01186334, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.05716181, + "balance_loss_mlp": 1.02424002, + "epoch": 0.2274995490891601, + "flos": 21798351999360.0, + "grad_norm": 1.7231169556552874, + "language_loss": 0.87688828, + "learning_rate": 3.60474587173969e-06, + "loss": 0.89908016, + "num_input_tokens_seen": 40195935, + "step": 1892, + "time_per_iteration": 3.260382890701294 + }, + { + "auxiliary_loss_clip": 0.01197128, + "auxiliary_loss_mlp": 0.01035994, + "balance_loss_clip": 1.0621841, + "balance_loss_mlp": 1.02658868, + "epoch": 0.2276197919797992, + "flos": 19058654972160.0, + "grad_norm": 1.9786237873562795, + "language_loss": 0.84173048, + "learning_rate": 3.6042808418507084e-06, + "loss": 0.86406171, + "num_input_tokens_seen": 40213620, + "step": 1893, + "time_per_iteration": 2.4510369300842285 + }, + { + "auxiliary_loss_clip": 0.01200645, + "auxiliary_loss_mlp": 0.01037345, + "balance_loss_clip": 1.062029, + "balance_loss_mlp": 1.02753949, + "epoch": 0.22774003487043828, + "flos": 18806777827200.0, + "grad_norm": 3.832560721133269, + "language_loss": 0.76831335, + "learning_rate": 3.6038155685935976e-06, + "loss": 0.79069322, + "num_input_tokens_seen": 40230190, + "step": 1894, + "time_per_iteration": 2.454312562942505 + }, + { + "auxiliary_loss_clip": 0.01195333, + "auxiliary_loss_mlp": 0.01033483, + "balance_loss_clip": 1.05904293, + "balance_loss_mlp": 1.02455401, + "epoch": 0.22786027776107737, + "flos": 23002544476800.0, + "grad_norm": 1.8478021553382742, + "language_loss": 0.70841503, + "learning_rate": 3.6033500520389404e-06, + "loss": 0.73070323, + "num_input_tokens_seen": 40246860, + "step": 1895, + "time_per_iteration": 2.4760398864746094 + }, + { + "auxiliary_loss_clip": 0.01070673, + "auxiliary_loss_mlp": 0.01006922, + "balance_loss_clip": 1.02587962, + "balance_loss_mlp": 1.00509787, + "epoch": 0.22798052065171648, + "flos": 66706872600960.0, + "grad_norm": 0.8040780334635551, + "language_loss": 0.64823902, + "learning_rate": 3.6028842922573553e-06, + "loss": 0.66901493, + "num_input_tokens_seen": 40311005, + "step": 1896, + "time_per_iteration": 3.206712245941162 + }, + { + "auxiliary_loss_clip": 0.01090266, + "auxiliary_loss_mlp": 0.00754752, + "balance_loss_clip": 1.03051019, + "balance_loss_mlp": 1.00066173, + "epoch": 0.22810076354235556, + "flos": 62080896758400.0, + "grad_norm": 0.8509710157333448, + "language_loss": 0.62913644, + "learning_rate": 3.602418289319497e-06, + "loss": 0.64758658, + "num_input_tokens_seen": 40369560, + "step": 1897, + "time_per_iteration": 3.1072418689727783 + }, + { + "auxiliary_loss_clip": 0.01149405, + "auxiliary_loss_mlp": 0.01034233, + "balance_loss_clip": 1.05266404, + "balance_loss_mlp": 1.02444577, + "epoch": 0.22822100643299464, + "flos": 23876358635520.0, + "grad_norm": 1.9594700156480302, + "language_loss": 0.73504794, + "learning_rate": 3.601952043296059e-06, + "loss": 0.75688434, + "num_input_tokens_seen": 40389555, + "step": 1898, + "time_per_iteration": 2.6265969276428223 + }, + { + "auxiliary_loss_clip": 0.01187299, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.05607986, + "balance_loss_mlp": 1.02270961, + "epoch": 0.22834124932363373, + "flos": 20991331180800.0, + "grad_norm": 2.209433595094319, + "language_loss": 0.80496663, + "learning_rate": 3.6014855542577696e-06, + "loss": 0.82716238, + "num_input_tokens_seen": 40406765, + "step": 1899, + "time_per_iteration": 2.5008046627044678 + }, + { + "auxiliary_loss_clip": 0.01182195, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.05792546, + "balance_loss_mlp": 1.01908135, + "epoch": 0.22846149221427284, + "flos": 24901572620160.0, + "grad_norm": 2.22064301455328, + "language_loss": 0.84333611, + "learning_rate": 3.6010188222753943e-06, + "loss": 0.86544639, + "num_input_tokens_seen": 40427535, + "step": 1900, + "time_per_iteration": 2.5809316635131836 + }, + { + "auxiliary_loss_clip": 0.01091189, + "auxiliary_loss_mlp": 0.01011563, + "balance_loss_clip": 1.02984834, + "balance_loss_mlp": 1.00984609, + "epoch": 0.22858173510491192, + "flos": 56132294319360.0, + "grad_norm": 0.9047666995725748, + "language_loss": 0.64161056, + "learning_rate": 3.6005518474197372e-06, + "loss": 0.66263807, + "num_input_tokens_seen": 40479580, + "step": 1901, + "time_per_iteration": 2.9828567504882812 + }, + { + "auxiliary_loss_clip": 0.0119795, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.06150997, + "balance_loss_mlp": 1.02176785, + "epoch": 0.228701977995551, + "flos": 24170826332160.0, + "grad_norm": 2.096582661508518, + "language_loss": 0.78212988, + "learning_rate": 3.6000846297616373e-06, + "loss": 0.80442476, + "num_input_tokens_seen": 40497880, + "step": 1902, + "time_per_iteration": 2.5138919353485107 + }, + { + "auxiliary_loss_clip": 0.01217016, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.06371641, + "balance_loss_mlp": 1.02481353, + "epoch": 0.22882222088619011, + "flos": 21387892308480.0, + "grad_norm": 2.208548764161951, + "language_loss": 0.72412646, + "learning_rate": 3.5996171693719717e-06, + "loss": 0.74664783, + "num_input_tokens_seen": 40513975, + "step": 1903, + "time_per_iteration": 2.4436137676239014 + }, + { + "auxiliary_loss_clip": 0.01109584, + "auxiliary_loss_mlp": 0.01005274, + "balance_loss_clip": 1.03206694, + "balance_loss_mlp": 1.00360513, + "epoch": 0.2289424637768292, + "flos": 64589615377920.0, + "grad_norm": 0.8396527258145293, + "language_loss": 0.64781499, + "learning_rate": 3.5991494663216528e-06, + "loss": 0.66896355, + "num_input_tokens_seen": 40576960, + "step": 1904, + "time_per_iteration": 3.1305947303771973 + }, + { + "auxiliary_loss_clip": 0.01213654, + "auxiliary_loss_mlp": 0.01029727, + "balance_loss_clip": 1.06340075, + "balance_loss_mlp": 1.02005911, + "epoch": 0.22906270666746828, + "flos": 22163419877760.0, + "grad_norm": 3.14637454918225, + "language_loss": 0.87503016, + "learning_rate": 3.5986815206816314e-06, + "loss": 0.89746398, + "num_input_tokens_seen": 40595780, + "step": 1905, + "time_per_iteration": 2.4825522899627686 + }, + { + "auxiliary_loss_clip": 0.01212803, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.06230724, + "balance_loss_mlp": 1.02745128, + "epoch": 0.2291829495581074, + "flos": 25772334122880.0, + "grad_norm": 1.974033283095527, + "language_loss": 0.74598277, + "learning_rate": 3.598213332522895e-06, + "loss": 0.76847559, + "num_input_tokens_seen": 40615810, + "step": 1906, + "time_per_iteration": 2.4877636432647705 + }, + { + "auxiliary_loss_clip": 0.01195163, + "auxiliary_loss_mlp": 0.01033071, + "balance_loss_clip": 1.05830097, + "balance_loss_mlp": 1.0232898, + "epoch": 0.22930319244874647, + "flos": 31172760126720.0, + "grad_norm": 1.821359731588867, + "language_loss": 0.77216184, + "learning_rate": 3.597744901916466e-06, + "loss": 0.7944442, + "num_input_tokens_seen": 40637095, + "step": 1907, + "time_per_iteration": 2.561657190322876 + }, + { + "auxiliary_loss_clip": 0.01215777, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.06053555, + "balance_loss_mlp": 1.02530122, + "epoch": 0.22942343533938556, + "flos": 23254098399360.0, + "grad_norm": 3.3067372795388024, + "language_loss": 0.77003586, + "learning_rate": 3.5972762289334058e-06, + "loss": 0.79254675, + "num_input_tokens_seen": 40656725, + "step": 1908, + "time_per_iteration": 2.472668409347534 + }, + { + "auxiliary_loss_clip": 0.01135152, + "auxiliary_loss_mlp": 0.01029602, + "balance_loss_clip": 1.05400538, + "balance_loss_mlp": 1.01947498, + "epoch": 0.22954367823002464, + "flos": 14610903436800.0, + "grad_norm": 2.24948213431362, + "language_loss": 0.84914607, + "learning_rate": 3.5968073136448116e-06, + "loss": 0.87079358, + "num_input_tokens_seen": 40674745, + "step": 1909, + "time_per_iteration": 2.600472927093506 + }, + { + "auxiliary_loss_clip": 0.01202628, + "auxiliary_loss_mlp": 0.01037943, + "balance_loss_clip": 1.06018329, + "balance_loss_mlp": 1.02745259, + "epoch": 0.22966392112066375, + "flos": 16763604405120.0, + "grad_norm": 1.7186533335638219, + "language_loss": 0.91463274, + "learning_rate": 3.596338156121818e-06, + "loss": 0.93703842, + "num_input_tokens_seen": 40693630, + "step": 1910, + "time_per_iteration": 2.455132007598877 + }, + { + "auxiliary_loss_clip": 0.0109297, + "auxiliary_loss_mlp": 0.01001664, + "balance_loss_clip": 1.02800202, + "balance_loss_mlp": 0.99997121, + "epoch": 0.22978416401130283, + "flos": 67474247783040.0, + "grad_norm": 0.7406317435003689, + "language_loss": 0.59394455, + "learning_rate": 3.595868756435595e-06, + "loss": 0.61489081, + "num_input_tokens_seen": 40761310, + "step": 1911, + "time_per_iteration": 4.0456647872924805 + }, + { + "auxiliary_loss_clip": 0.01173517, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.06183112, + "balance_loss_mlp": 1.02435338, + "epoch": 0.22990440690194192, + "flos": 19865137086720.0, + "grad_norm": 2.649085475940073, + "language_loss": 0.80906856, + "learning_rate": 3.5953991146573504e-06, + "loss": 0.83114481, + "num_input_tokens_seen": 40779955, + "step": 1912, + "time_per_iteration": 2.576812505722046 + }, + { + "auxiliary_loss_clip": 0.01199708, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.05798328, + "balance_loss_mlp": 1.02240396, + "epoch": 0.23002464979258103, + "flos": 13289246507520.0, + "grad_norm": 2.4439647535748903, + "language_loss": 0.83580017, + "learning_rate": 3.5949292308583294e-06, + "loss": 0.85812366, + "num_input_tokens_seen": 40793200, + "step": 1913, + "time_per_iteration": 2.4744374752044678 + }, + { + "auxiliary_loss_clip": 0.01214594, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.06406713, + "balance_loss_mlp": 1.0239476, + "epoch": 0.2301448926832201, + "flos": 22163779013760.0, + "grad_norm": 2.0029949690347464, + "language_loss": 0.80833125, + "learning_rate": 3.594459105109811e-06, + "loss": 0.83082575, + "num_input_tokens_seen": 40812380, + "step": 1914, + "time_per_iteration": 2.452862501144409 + }, + { + "auxiliary_loss_clip": 0.01202086, + "auxiliary_loss_mlp": 0.01033229, + "balance_loss_clip": 1.06376553, + "balance_loss_mlp": 1.02444339, + "epoch": 0.2302651355738592, + "flos": 20704477167360.0, + "grad_norm": 1.7716538625401077, + "language_loss": 0.81096971, + "learning_rate": 3.593988737483115e-06, + "loss": 0.83332288, + "num_input_tokens_seen": 40832320, + "step": 1915, + "time_per_iteration": 2.4938321113586426 + }, + { + "auxiliary_loss_clip": 0.01182161, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.0580945, + "balance_loss_mlp": 1.02159071, + "epoch": 0.23038537846449827, + "flos": 18588943797120.0, + "grad_norm": 1.9160218149132646, + "language_loss": 0.78065801, + "learning_rate": 3.5935181280495947e-06, + "loss": 0.8027944, + "num_input_tokens_seen": 40850900, + "step": 1916, + "time_per_iteration": 2.5129659175872803 + }, + { + "auxiliary_loss_clip": 0.01087867, + "auxiliary_loss_mlp": 0.01001513, + "balance_loss_clip": 1.02836192, + "balance_loss_mlp": 0.99984437, + "epoch": 0.23050562135513739, + "flos": 64224260190720.0, + "grad_norm": 0.9080719464070544, + "language_loss": 0.54271203, + "learning_rate": 3.5930472768806412e-06, + "loss": 0.56360584, + "num_input_tokens_seen": 40909570, + "step": 1917, + "time_per_iteration": 3.8663241863250732 + }, + { + "auxiliary_loss_clip": 0.01211672, + "auxiliary_loss_mlp": 0.01036062, + "balance_loss_clip": 1.06318474, + "balance_loss_mlp": 1.02603054, + "epoch": 0.23062586424577647, + "flos": 17313396952320.0, + "grad_norm": 2.302850048551058, + "language_loss": 0.77138782, + "learning_rate": 3.5925761840476826e-06, + "loss": 0.7938652, + "num_input_tokens_seen": 40928180, + "step": 1918, + "time_per_iteration": 3.3247687816619873 + }, + { + "auxiliary_loss_clip": 0.01178787, + "auxiliary_loss_mlp": 0.01031142, + "balance_loss_clip": 1.05813265, + "balance_loss_mlp": 1.02196872, + "epoch": 0.23074610713641555, + "flos": 27855979194240.0, + "grad_norm": 7.456738816837622, + "language_loss": 0.81363225, + "learning_rate": 3.592104849622183e-06, + "loss": 0.83573157, + "num_input_tokens_seen": 40950435, + "step": 1919, + "time_per_iteration": 3.3578546047210693 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.05322242, + "balance_loss_mlp": 1.02280807, + "epoch": 0.23086635002705466, + "flos": 28841798937600.0, + "grad_norm": 1.5063473308876887, + "language_loss": 0.73119915, + "learning_rate": 3.591633273675644e-06, + "loss": 0.75295943, + "num_input_tokens_seen": 40972670, + "step": 1920, + "time_per_iteration": 2.6541922092437744 + }, + { + "auxiliary_loss_clip": 0.01064178, + "auxiliary_loss_mlp": 0.01008834, + "balance_loss_clip": 1.02913833, + "balance_loss_mlp": 1.00650966, + "epoch": 0.23098659291769374, + "flos": 62923681566720.0, + "grad_norm": 0.9073545390728627, + "language_loss": 0.58245361, + "learning_rate": 3.591161456279602e-06, + "loss": 0.60318375, + "num_input_tokens_seen": 41018215, + "step": 1921, + "time_per_iteration": 2.92101788520813 + }, + { + "auxiliary_loss_clip": 0.01186544, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.0567441, + "balance_loss_mlp": 1.02385259, + "epoch": 0.23110683580833283, + "flos": 23476816679040.0, + "grad_norm": 1.5056857106502466, + "language_loss": 0.80467165, + "learning_rate": 3.590689397505633e-06, + "loss": 0.82687235, + "num_input_tokens_seen": 41039125, + "step": 1922, + "time_per_iteration": 2.712777853012085 + }, + { + "auxiliary_loss_clip": 0.01210273, + "auxiliary_loss_mlp": 0.010294, + "balance_loss_clip": 1.06129718, + "balance_loss_mlp": 1.01991057, + "epoch": 0.2312270786989719, + "flos": 27271066124160.0, + "grad_norm": 1.6353858722836059, + "language_loss": 0.86771405, + "learning_rate": 3.590217097425347e-06, + "loss": 0.89011079, + "num_input_tokens_seen": 41059025, + "step": 1923, + "time_per_iteration": 2.529050827026367 + }, + { + "auxiliary_loss_clip": 0.01215289, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.06352818, + "balance_loss_mlp": 1.02207446, + "epoch": 0.23134732158961102, + "flos": 13261344618240.0, + "grad_norm": 1.7843459673148958, + "language_loss": 0.71058226, + "learning_rate": 3.589744556110391e-06, + "loss": 0.73305655, + "num_input_tokens_seen": 41077015, + "step": 1924, + "time_per_iteration": 2.439056158065796 + }, + { + "auxiliary_loss_clip": 0.01179935, + "auxiliary_loss_mlp": 0.01034317, + "balance_loss_clip": 1.05626798, + "balance_loss_mlp": 1.02564418, + "epoch": 0.2314675644802501, + "flos": 36977648250240.0, + "grad_norm": 1.6877289085880027, + "language_loss": 0.84338379, + "learning_rate": 3.58927177363245e-06, + "loss": 0.86552632, + "num_input_tokens_seen": 41099840, + "step": 1925, + "time_per_iteration": 2.697558641433716 + }, + { + "auxiliary_loss_clip": 0.01163788, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.05415154, + "balance_loss_mlp": 1.02501678, + "epoch": 0.2315878073708892, + "flos": 23842207779840.0, + "grad_norm": 2.3634779859918296, + "language_loss": 0.72541994, + "learning_rate": 3.5887987500632447e-06, + "loss": 0.74741197, + "num_input_tokens_seen": 41117845, + "step": 1926, + "time_per_iteration": 2.5524322986602783 + }, + { + "auxiliary_loss_clip": 0.01169859, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.05530918, + "balance_loss_mlp": 1.02742767, + "epoch": 0.2317080502615283, + "flos": 23039424766080.0, + "grad_norm": 1.8556639998048141, + "language_loss": 0.84464228, + "learning_rate": 3.5883254854745325e-06, + "loss": 0.86670184, + "num_input_tokens_seen": 41136235, + "step": 1927, + "time_per_iteration": 2.6146647930145264 + }, + { + "auxiliary_loss_clip": 0.0119896, + "auxiliary_loss_mlp": 0.01031803, + "balance_loss_clip": 1.05650103, + "balance_loss_mlp": 1.0221529, + "epoch": 0.23182829315216738, + "flos": 11254656435840.0, + "grad_norm": 1.9891985581114355, + "language_loss": 0.75201017, + "learning_rate": 3.587851979938107e-06, + "loss": 0.7743178, + "num_input_tokens_seen": 41153125, + "step": 1928, + "time_per_iteration": 2.4627838134765625 + }, + { + "auxiliary_loss_clip": 0.0119501, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.05958784, + "balance_loss_mlp": 1.02259636, + "epoch": 0.23194853604280646, + "flos": 19828939155840.0, + "grad_norm": 2.727395033535805, + "language_loss": 0.77979016, + "learning_rate": 3.5873782335257985e-06, + "loss": 0.80206043, + "num_input_tokens_seen": 41171290, + "step": 1929, + "time_per_iteration": 2.4804580211639404 + }, + { + "auxiliary_loss_clip": 0.01163595, + "auxiliary_loss_mlp": 0.01036443, + "balance_loss_clip": 1.05667007, + "balance_loss_mlp": 1.02674484, + "epoch": 0.23206877893344555, + "flos": 15305020830720.0, + "grad_norm": 2.197442347653758, + "language_loss": 0.78496575, + "learning_rate": 3.5869042463094744e-06, + "loss": 0.80696607, + "num_input_tokens_seen": 41189005, + "step": 1930, + "time_per_iteration": 2.513066053390503 + }, + { + "auxiliary_loss_clip": 0.01131983, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.04848385, + "balance_loss_mlp": 1.02873111, + "epoch": 0.23218902182408466, + "flos": 22711488572160.0, + "grad_norm": 1.9071195582850453, + "language_loss": 0.76829469, + "learning_rate": 3.586430018361038e-06, + "loss": 0.79000306, + "num_input_tokens_seen": 41208775, + "step": 1931, + "time_per_iteration": 2.6066949367523193 + }, + { + "auxiliary_loss_clip": 0.01166604, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.05240917, + "balance_loss_mlp": 1.02277541, + "epoch": 0.23230926471472374, + "flos": 22710734386560.0, + "grad_norm": 2.443180941755307, + "language_loss": 0.76346928, + "learning_rate": 3.5859555497524283e-06, + "loss": 0.78546685, + "num_input_tokens_seen": 41226010, + "step": 1932, + "time_per_iteration": 2.5034677982330322 + }, + { + "auxiliary_loss_clip": 0.01195427, + "auxiliary_loss_mlp": 0.01036002, + "balance_loss_clip": 1.05957794, + "balance_loss_mlp": 1.02704334, + "epoch": 0.23242950760536282, + "flos": 20375499479040.0, + "grad_norm": 1.9287028496982914, + "language_loss": 0.91858327, + "learning_rate": 3.5854808405556237e-06, + "loss": 0.94089758, + "num_input_tokens_seen": 41245245, + "step": 1933, + "time_per_iteration": 2.479234457015991 + }, + { + "auxiliary_loss_clip": 0.01164682, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.05380058, + "balance_loss_mlp": 1.02699971, + "epoch": 0.23254975049600193, + "flos": 16908324301440.0, + "grad_norm": 2.2676536556648554, + "language_loss": 0.74906296, + "learning_rate": 3.5850058908426355e-06, + "loss": 0.77106261, + "num_input_tokens_seen": 41263795, + "step": 1934, + "time_per_iteration": 2.5206565856933594 + }, + { + "auxiliary_loss_clip": 0.01182763, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.05431151, + "balance_loss_mlp": 1.02611232, + "epoch": 0.23266999338664102, + "flos": 23294821443840.0, + "grad_norm": 1.7005426418367304, + "language_loss": 0.85302925, + "learning_rate": 3.584530700685514e-06, + "loss": 0.87520927, + "num_input_tokens_seen": 41284055, + "step": 1935, + "time_per_iteration": 2.5419108867645264 + }, + { + "auxiliary_loss_clip": 0.01181153, + "auxiliary_loss_mlp": 0.01029347, + "balance_loss_clip": 1.06176329, + "balance_loss_mlp": 1.02011478, + "epoch": 0.2327902362772801, + "flos": 19569987031680.0, + "grad_norm": 1.9727706416318, + "language_loss": 0.88636446, + "learning_rate": 3.5840552701563448e-06, + "loss": 0.90846944, + "num_input_tokens_seen": 41300255, + "step": 1936, + "time_per_iteration": 2.4959185123443604 + }, + { + "auxiliary_loss_clip": 0.01184918, + "auxiliary_loss_mlp": 0.01032039, + "balance_loss_clip": 1.05404174, + "balance_loss_mlp": 1.02241278, + "epoch": 0.2329104791679192, + "flos": 16727514215040.0, + "grad_norm": 2.3995711887507736, + "language_loss": 0.82066882, + "learning_rate": 3.5835795993272513e-06, + "loss": 0.84283841, + "num_input_tokens_seen": 41318540, + "step": 1937, + "time_per_iteration": 2.5202860832214355 + }, + { + "auxiliary_loss_clip": 0.01124891, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.05209231, + "balance_loss_mlp": 1.03323257, + "epoch": 0.2330307220585583, + "flos": 22163743100160.0, + "grad_norm": 4.447855821462046, + "language_loss": 0.71046537, + "learning_rate": 3.583103688270391e-06, + "loss": 0.73214501, + "num_input_tokens_seen": 41338320, + "step": 1938, + "time_per_iteration": 3.511669874191284 + }, + { + "auxiliary_loss_clip": 0.01167642, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.05252004, + "balance_loss_mlp": 1.02556562, + "epoch": 0.23315096494919738, + "flos": 19317319787520.0, + "grad_norm": 2.0946630408222084, + "language_loss": 0.89410293, + "learning_rate": 3.58262753705796e-06, + "loss": 0.91614395, + "num_input_tokens_seen": 41353210, + "step": 1939, + "time_per_iteration": 2.519331693649292 + }, + { + "auxiliary_loss_clip": 0.01094488, + "auxiliary_loss_mlp": 0.01009485, + "balance_loss_clip": 1.03496528, + "balance_loss_mlp": 1.00788724, + "epoch": 0.23327120783983646, + "flos": 53031048946560.0, + "grad_norm": 0.762076360197271, + "language_loss": 0.55555022, + "learning_rate": 3.5821511457621902e-06, + "loss": 0.57658994, + "num_input_tokens_seen": 41410510, + "step": 1940, + "time_per_iteration": 3.0885448455810547 + }, + { + "auxiliary_loss_clip": 0.01178312, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.05692327, + "balance_loss_mlp": 1.02673125, + "epoch": 0.23339145073047557, + "flos": 17126984344320.0, + "grad_norm": 3.0561073140637784, + "language_loss": 0.81302071, + "learning_rate": 3.5816745144553497e-06, + "loss": 0.83517313, + "num_input_tokens_seen": 41425830, + "step": 1941, + "time_per_iteration": 2.4839398860931396 + }, + { + "auxiliary_loss_clip": 0.01146039, + "auxiliary_loss_mlp": 0.01027794, + "balance_loss_clip": 1.05399966, + "balance_loss_mlp": 1.01841247, + "epoch": 0.23351169362111465, + "flos": 13078918419840.0, + "grad_norm": 2.814779265914372, + "language_loss": 0.75493437, + "learning_rate": 3.5811976432097424e-06, + "loss": 0.77667266, + "num_input_tokens_seen": 41443500, + "step": 1942, + "time_per_iteration": 2.5535335540771484 + }, + { + "auxiliary_loss_clip": 0.01195001, + "auxiliary_loss_mlp": 0.00764066, + "balance_loss_clip": 1.06108856, + "balance_loss_mlp": 1.00104523, + "epoch": 0.23363193651175373, + "flos": 15851257931520.0, + "grad_norm": 2.2281377174652794, + "language_loss": 0.84510922, + "learning_rate": 3.58072053209771e-06, + "loss": 0.86469984, + "num_input_tokens_seen": 41460055, + "step": 1943, + "time_per_iteration": 2.499406099319458 + }, + { + "auxiliary_loss_clip": 0.01173637, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.05330825, + "balance_loss_mlp": 1.0238533, + "epoch": 0.23375217940239285, + "flos": 21025769345280.0, + "grad_norm": 2.0944955210956797, + "language_loss": 0.7908181, + "learning_rate": 3.5802431811916296e-06, + "loss": 0.81289077, + "num_input_tokens_seen": 41476665, + "step": 1944, + "time_per_iteration": 4.196576356887817 + }, + { + "auxiliary_loss_clip": 0.0117512, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.05613697, + "balance_loss_mlp": 1.01920271, + "epoch": 0.23387242229303193, + "flos": 20594698225920.0, + "grad_norm": 1.9070219817968121, + "language_loss": 0.80690712, + "learning_rate": 3.579765590563916e-06, + "loss": 0.82894051, + "num_input_tokens_seen": 41496065, + "step": 1945, + "time_per_iteration": 3.342613458633423 + }, + { + "auxiliary_loss_clip": 0.01183395, + "auxiliary_loss_mlp": 0.01032511, + "balance_loss_clip": 1.05560887, + "balance_loss_mlp": 1.02302241, + "epoch": 0.233992665183671, + "flos": 24279491952000.0, + "grad_norm": 1.8303253083666764, + "language_loss": 0.81812763, + "learning_rate": 3.579287760287017e-06, + "loss": 0.84028673, + "num_input_tokens_seen": 41516815, + "step": 1946, + "time_per_iteration": 2.544931173324585 + }, + { + "auxiliary_loss_clip": 0.0119233, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.05827081, + "balance_loss_mlp": 1.02507567, + "epoch": 0.2341129080743101, + "flos": 30154621121280.0, + "grad_norm": 1.8392033690714538, + "language_loss": 0.72793335, + "learning_rate": 3.578809690433421e-06, + "loss": 0.75019777, + "num_input_tokens_seen": 41538525, + "step": 1947, + "time_per_iteration": 2.5669567584991455 + }, + { + "auxiliary_loss_clip": 0.01215407, + "auxiliary_loss_mlp": 0.01036612, + "balance_loss_clip": 1.06262004, + "balance_loss_mlp": 1.02676547, + "epoch": 0.2342331509649492, + "flos": 22784135829120.0, + "grad_norm": 4.005893996273609, + "language_loss": 0.81113994, + "learning_rate": 3.578331381075651e-06, + "loss": 0.83366013, + "num_input_tokens_seen": 41559025, + "step": 1948, + "time_per_iteration": 2.4530582427978516 + }, + { + "auxiliary_loss_clip": 0.01196052, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.05800653, + "balance_loss_mlp": 1.02032363, + "epoch": 0.2343533938555883, + "flos": 23623152687360.0, + "grad_norm": 2.29998884192404, + "language_loss": 0.69959247, + "learning_rate": 3.5778528322862646e-06, + "loss": 0.72185367, + "num_input_tokens_seen": 41577845, + "step": 1949, + "time_per_iteration": 2.4965648651123047 + }, + { + "auxiliary_loss_clip": 0.01196498, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.05714893, + "balance_loss_mlp": 1.02231812, + "epoch": 0.23447363674622737, + "flos": 24570332375040.0, + "grad_norm": 1.4333363379706736, + "language_loss": 0.8650347, + "learning_rate": 3.5773740441378585e-06, + "loss": 0.88731283, + "num_input_tokens_seen": 41598600, + "step": 1950, + "time_per_iteration": 2.4947404861450195 + }, + { + "auxiliary_loss_clip": 0.01191019, + "auxiliary_loss_mlp": 0.01036777, + "balance_loss_clip": 1.05705655, + "balance_loss_mlp": 1.0277946, + "epoch": 0.23459387963686648, + "flos": 53140322119680.0, + "grad_norm": 1.7019286599252261, + "language_loss": 0.73543864, + "learning_rate": 3.5768950167030633e-06, + "loss": 0.75771654, + "num_input_tokens_seen": 41623300, + "step": 1951, + "time_per_iteration": 2.832526445388794 + }, + { + "auxiliary_loss_clip": 0.01169855, + "auxiliary_loss_mlp": 0.01038853, + "balance_loss_clip": 1.05426359, + "balance_loss_mlp": 1.02863121, + "epoch": 0.23471412252750556, + "flos": 23951412103680.0, + "grad_norm": 1.8080582810706136, + "language_loss": 0.78592414, + "learning_rate": 3.576415750054548e-06, + "loss": 0.80801123, + "num_input_tokens_seen": 41643420, + "step": 1952, + "time_per_iteration": 2.5353944301605225 + }, + { + "auxiliary_loss_clip": 0.01171031, + "auxiliary_loss_mlp": 0.01027405, + "balance_loss_clip": 1.05397022, + "balance_loss_mlp": 1.01798141, + "epoch": 0.23483436541814465, + "flos": 15706573948800.0, + "grad_norm": 2.0703214103623306, + "language_loss": 0.85655081, + "learning_rate": 3.5759362442650172e-06, + "loss": 0.87853515, + "num_input_tokens_seen": 41660170, + "step": 1953, + "time_per_iteration": 2.4940755367279053 + }, + { + "auxiliary_loss_clip": 0.01197657, + "auxiliary_loss_mlp": 0.01037699, + "balance_loss_clip": 1.06208384, + "balance_loss_mlp": 1.02786446, + "epoch": 0.23495460830878373, + "flos": 24936262179840.0, + "grad_norm": 2.2820312120406805, + "language_loss": 0.85677314, + "learning_rate": 3.5754564994072113e-06, + "loss": 0.87912667, + "num_input_tokens_seen": 41679010, + "step": 1954, + "time_per_iteration": 2.507598876953125 + }, + { + "auxiliary_loss_clip": 0.01177887, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.05579591, + "balance_loss_mlp": 1.01818132, + "epoch": 0.23507485119942284, + "flos": 30482665056000.0, + "grad_norm": 3.449529449489836, + "language_loss": 0.59643817, + "learning_rate": 3.5749765155539067e-06, + "loss": 0.61850238, + "num_input_tokens_seen": 41699495, + "step": 1955, + "time_per_iteration": 2.6668214797973633 + }, + { + "auxiliary_loss_clip": 0.01162357, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.05296242, + "balance_loss_mlp": 1.02195013, + "epoch": 0.23519509409006192, + "flos": 18329129746560.0, + "grad_norm": 2.0889383424788512, + "language_loss": 0.92178923, + "learning_rate": 3.574496292777917e-06, + "loss": 0.94373298, + "num_input_tokens_seen": 41717705, + "step": 1956, + "time_per_iteration": 2.519028663635254 + }, + { + "auxiliary_loss_clip": 0.01188259, + "auxiliary_loss_mlp": 0.01039365, + "balance_loss_clip": 1.05840158, + "balance_loss_mlp": 1.02860069, + "epoch": 0.235315336980701, + "flos": 29643217234560.0, + "grad_norm": 1.9944505516913964, + "language_loss": 0.71760261, + "learning_rate": 3.574015831152092e-06, + "loss": 0.73987889, + "num_input_tokens_seen": 41738120, + "step": 1957, + "time_per_iteration": 2.5755717754364014 + }, + { + "auxiliary_loss_clip": 0.01171126, + "auxiliary_loss_mlp": 0.0102877, + "balance_loss_clip": 1.05480516, + "balance_loss_mlp": 1.01953769, + "epoch": 0.23543557987134012, + "flos": 18551704371840.0, + "grad_norm": 2.1748782295751448, + "language_loss": 0.83523822, + "learning_rate": 3.573535130749316e-06, + "loss": 0.8572371, + "num_input_tokens_seen": 41756070, + "step": 1958, + "time_per_iteration": 2.4874367713928223 + }, + { + "auxiliary_loss_clip": 0.01171534, + "auxiliary_loss_mlp": 0.0103606, + "balance_loss_clip": 1.05612242, + "balance_loss_mlp": 1.0266242, + "epoch": 0.2355558227619792, + "flos": 24679033908480.0, + "grad_norm": 1.6505177091237302, + "language_loss": 0.7380923, + "learning_rate": 3.5730541916425127e-06, + "loss": 0.76016819, + "num_input_tokens_seen": 41777550, + "step": 1959, + "time_per_iteration": 2.5593602657318115 + }, + { + "auxiliary_loss_clip": 0.0116974, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.05687869, + "balance_loss_mlp": 1.02144718, + "epoch": 0.23567606565261828, + "flos": 21944795748480.0, + "grad_norm": 2.191747657827774, + "language_loss": 0.86217797, + "learning_rate": 3.572573013904639e-06, + "loss": 0.88418388, + "num_input_tokens_seen": 41797460, + "step": 1960, + "time_per_iteration": 2.5693535804748535 + }, + { + "auxiliary_loss_clip": 0.01209579, + "auxiliary_loss_mlp": 0.0102695, + "balance_loss_clip": 1.05912602, + "balance_loss_mlp": 1.01752019, + "epoch": 0.2357963085432574, + "flos": 13589352639360.0, + "grad_norm": 2.1483254493480497, + "language_loss": 0.92030525, + "learning_rate": 3.572091597608689e-06, + "loss": 0.94267046, + "num_input_tokens_seen": 41815585, + "step": 1961, + "time_per_iteration": 2.4453775882720947 + }, + { + "auxiliary_loss_clip": 0.01187965, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.0585885, + "balance_loss_mlp": 1.0214448, + "epoch": 0.23591655143389648, + "flos": 22088689632000.0, + "grad_norm": 2.082642869489746, + "language_loss": 0.73624814, + "learning_rate": 3.571609942827694e-06, + "loss": 0.75844616, + "num_input_tokens_seen": 41834700, + "step": 1962, + "time_per_iteration": 2.519336700439453 + }, + { + "auxiliary_loss_clip": 0.01180052, + "auxiliary_loss_mlp": 0.0103066, + "balance_loss_clip": 1.05754554, + "balance_loss_mlp": 1.02139783, + "epoch": 0.23603679432453556, + "flos": 17017349057280.0, + "grad_norm": 2.0141837876752233, + "language_loss": 0.88374054, + "learning_rate": 3.57112804963472e-06, + "loss": 0.90584767, + "num_input_tokens_seen": 41852915, + "step": 1963, + "time_per_iteration": 2.499077796936035 + }, + { + "auxiliary_loss_clip": 0.01163569, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.05818379, + "balance_loss_mlp": 1.02410722, + "epoch": 0.23615703721517464, + "flos": 19171307001600.0, + "grad_norm": 1.8335817382633506, + "language_loss": 0.76499212, + "learning_rate": 3.57064591810287e-06, + "loss": 0.7869575, + "num_input_tokens_seen": 41870415, + "step": 1964, + "time_per_iteration": 2.5408272743225098 + }, + { + "auxiliary_loss_clip": 0.01209766, + "auxiliary_loss_mlp": 0.00764064, + "balance_loss_clip": 1.06146097, + "balance_loss_mlp": 1.00107098, + "epoch": 0.23627728010581375, + "flos": 19098803399040.0, + "grad_norm": 2.091247943689354, + "language_loss": 0.81079847, + "learning_rate": 3.570163548305284e-06, + "loss": 0.83053684, + "num_input_tokens_seen": 41889345, + "step": 1965, + "time_per_iteration": 3.2580833435058594 + }, + { + "auxiliary_loss_clip": 0.01179277, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.05619395, + "balance_loss_mlp": 1.02648187, + "epoch": 0.23639752299645284, + "flos": 14282213057280.0, + "grad_norm": 2.225712014217286, + "language_loss": 0.69493705, + "learning_rate": 3.569680940315135e-06, + "loss": 0.7170974, + "num_input_tokens_seen": 41905745, + "step": 1966, + "time_per_iteration": 2.5123424530029297 + }, + { + "auxiliary_loss_clip": 0.01171246, + "auxiliary_loss_mlp": 0.01036642, + "balance_loss_clip": 1.05665433, + "balance_loss_mlp": 1.02585959, + "epoch": 0.23651776588709192, + "flos": 22893411980160.0, + "grad_norm": 1.8032376027295576, + "language_loss": 0.82164556, + "learning_rate": 3.5691980942056356e-06, + "loss": 0.84372449, + "num_input_tokens_seen": 41925115, + "step": 1967, + "time_per_iteration": 2.562812328338623 + }, + { + "auxiliary_loss_clip": 0.01200229, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.05865979, + "balance_loss_mlp": 1.01964009, + "epoch": 0.23663800877773103, + "flos": 18624531196800.0, + "grad_norm": 1.8431660051754692, + "language_loss": 0.79439557, + "learning_rate": 3.5687150100500332e-06, + "loss": 0.81669039, + "num_input_tokens_seen": 41944815, + "step": 1968, + "time_per_iteration": 2.5219881534576416 + }, + { + "auxiliary_loss_clip": 0.01194978, + "auxiliary_loss_mlp": 0.01028519, + "balance_loss_clip": 1.05712223, + "balance_loss_mlp": 1.01887512, + "epoch": 0.2367582516683701, + "flos": 25555828896000.0, + "grad_norm": 1.5966483987149835, + "language_loss": 0.74536216, + "learning_rate": 3.568231687921611e-06, + "loss": 0.76759714, + "num_input_tokens_seen": 41964990, + "step": 1969, + "time_per_iteration": 2.592735528945923 + }, + { + "auxiliary_loss_clip": 0.01208422, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.06056547, + "balance_loss_mlp": 1.02361417, + "epoch": 0.2368784945590092, + "flos": 23295072839040.0, + "grad_norm": 1.4908503387488736, + "language_loss": 0.80382669, + "learning_rate": 3.5677481278936883e-06, + "loss": 0.82623649, + "num_input_tokens_seen": 41984570, + "step": 1970, + "time_per_iteration": 3.3571486473083496 + }, + { + "auxiliary_loss_clip": 0.01106091, + "auxiliary_loss_mlp": 0.01003246, + "balance_loss_clip": 1.04770589, + "balance_loss_mlp": 1.00119519, + "epoch": 0.23699873744964828, + "flos": 69859291875840.0, + "grad_norm": 0.8305623939719402, + "language_loss": 0.57858795, + "learning_rate": 3.5672643300396214e-06, + "loss": 0.59968132, + "num_input_tokens_seen": 42053715, + "step": 1971, + "time_per_iteration": 4.0693888664245605 + }, + { + "auxiliary_loss_clip": 0.01166607, + "auxiliary_loss_mlp": 0.01032146, + "balance_loss_clip": 1.0561471, + "balance_loss_mlp": 1.02358079, + "epoch": 0.2371189803402874, + "flos": 21835052720640.0, + "grad_norm": 2.1581035283230445, + "language_loss": 0.67821872, + "learning_rate": 3.566780294432802e-06, + "loss": 0.70020628, + "num_input_tokens_seen": 42070890, + "step": 1972, + "time_per_iteration": 3.3236582279205322 + }, + { + "auxiliary_loss_clip": 0.012112, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.06122279, + "balance_loss_mlp": 1.0253408, + "epoch": 0.23723922323092647, + "flos": 21908490076800.0, + "grad_norm": 3.020341069181952, + "language_loss": 0.74790835, + "learning_rate": 3.566296021146657e-06, + "loss": 0.77036005, + "num_input_tokens_seen": 42090270, + "step": 1973, + "time_per_iteration": 2.476217746734619 + }, + { + "auxiliary_loss_clip": 0.01215522, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.06330132, + "balance_loss_mlp": 1.01981199, + "epoch": 0.23735946612156555, + "flos": 32708803380480.0, + "grad_norm": 1.714424796244191, + "language_loss": 0.73202336, + "learning_rate": 3.565811510254652e-06, + "loss": 0.75448, + "num_input_tokens_seen": 42111150, + "step": 1974, + "time_per_iteration": 2.55385684967041 + }, + { + "auxiliary_loss_clip": 0.01104221, + "auxiliary_loss_mlp": 0.01022096, + "balance_loss_clip": 1.0378381, + "balance_loss_mlp": 1.01942599, + "epoch": 0.23747970901220466, + "flos": 70546944821760.0, + "grad_norm": 0.8250801290332915, + "language_loss": 0.58260167, + "learning_rate": 3.5653267618302845e-06, + "loss": 0.60386491, + "num_input_tokens_seen": 42178730, + "step": 1975, + "time_per_iteration": 3.116600275039673 + }, + { + "auxiliary_loss_clip": 0.01208879, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.05930519, + "balance_loss_mlp": 1.02248716, + "epoch": 0.23759995190284375, + "flos": 20849807594880.0, + "grad_norm": 2.0319083326706395, + "language_loss": 0.85615057, + "learning_rate": 3.564841775947093e-06, + "loss": 0.87856209, + "num_input_tokens_seen": 42199620, + "step": 1976, + "time_per_iteration": 2.463066577911377 + }, + { + "auxiliary_loss_clip": 0.01164922, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.05365968, + "balance_loss_mlp": 1.0239172, + "epoch": 0.23772019479348283, + "flos": 32921645420160.0, + "grad_norm": 2.3155159880116485, + "language_loss": 0.76085317, + "learning_rate": 3.5643565526786475e-06, + "loss": 0.78284192, + "num_input_tokens_seen": 42219560, + "step": 1977, + "time_per_iteration": 2.707880735397339 + }, + { + "auxiliary_loss_clip": 0.01212583, + "auxiliary_loss_mlp": 0.01031647, + "balance_loss_clip": 1.06224513, + "balance_loss_mlp": 1.02216387, + "epoch": 0.2378404376841219, + "flos": 32342765834880.0, + "grad_norm": 1.578334388010235, + "language_loss": 0.77168584, + "learning_rate": 3.5638710920985574e-06, + "loss": 0.79412812, + "num_input_tokens_seen": 42241020, + "step": 1978, + "time_per_iteration": 2.5489962100982666 + }, + { + "auxiliary_loss_clip": 0.01202672, + "auxiliary_loss_mlp": 0.00765138, + "balance_loss_clip": 1.0582931, + "balance_loss_mlp": 1.0010426, + "epoch": 0.23796068057476102, + "flos": 22997624313600.0, + "grad_norm": 2.1064573105054776, + "language_loss": 0.81533271, + "learning_rate": 3.5633853942804655e-06, + "loss": 0.83501077, + "num_input_tokens_seen": 42259345, + "step": 1979, + "time_per_iteration": 2.4966232776641846 + }, + { + "auxiliary_loss_clip": 0.01167636, + "auxiliary_loss_mlp": 0.01033803, + "balance_loss_clip": 1.05255985, + "balance_loss_mlp": 1.02381372, + "epoch": 0.2380809234654001, + "flos": 13480938414720.0, + "grad_norm": 3.389874887004538, + "language_loss": 0.76879197, + "learning_rate": 3.5628994592980527e-06, + "loss": 0.79080635, + "num_input_tokens_seen": 42277250, + "step": 1980, + "time_per_iteration": 2.5508618354797363 + }, + { + "auxiliary_loss_clip": 0.0121223, + "auxiliary_loss_mlp": 0.01031529, + "balance_loss_clip": 1.06083322, + "balance_loss_mlp": 1.02267718, + "epoch": 0.2382011663560392, + "flos": 16871803148160.0, + "grad_norm": 1.9230295745879118, + "language_loss": 0.70369422, + "learning_rate": 3.562413287225034e-06, + "loss": 0.7261318, + "num_input_tokens_seen": 42295360, + "step": 1981, + "time_per_iteration": 2.4266204833984375 + }, + { + "auxiliary_loss_clip": 0.01192312, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.05920398, + "balance_loss_mlp": 1.01951706, + "epoch": 0.2383214092466783, + "flos": 18441135331200.0, + "grad_norm": 2.189259462930049, + "language_loss": 0.89515364, + "learning_rate": 3.5619268781351623e-06, + "loss": 0.91736799, + "num_input_tokens_seen": 42313430, + "step": 1982, + "time_per_iteration": 2.4984307289123535 + }, + { + "auxiliary_loss_clip": 0.01176238, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.05834305, + "balance_loss_mlp": 1.02224338, + "epoch": 0.23844165213731738, + "flos": 19755717281280.0, + "grad_norm": 2.076334799203889, + "language_loss": 0.76870787, + "learning_rate": 3.5614402321022256e-06, + "loss": 0.79078031, + "num_input_tokens_seen": 42331260, + "step": 1983, + "time_per_iteration": 2.522869110107422 + }, + { + "auxiliary_loss_clip": 0.01143687, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.0510968, + "balance_loss_mlp": 1.02254772, + "epoch": 0.23856189502795647, + "flos": 23367360960000.0, + "grad_norm": 1.7014832108313558, + "language_loss": 0.87191939, + "learning_rate": 3.5609533492000463e-06, + "loss": 0.89367479, + "num_input_tokens_seen": 42350150, + "step": 1984, + "time_per_iteration": 2.61458158493042 + }, + { + "auxiliary_loss_clip": 0.01177556, + "auxiliary_loss_mlp": 0.01026957, + "balance_loss_clip": 1.05840349, + "balance_loss_mlp": 1.01724744, + "epoch": 0.23868213791859555, + "flos": 23475056912640.0, + "grad_norm": 2.0577482234676463, + "language_loss": 0.78511691, + "learning_rate": 3.560466229502485e-06, + "loss": 0.80716199, + "num_input_tokens_seen": 42369495, + "step": 1985, + "time_per_iteration": 2.5246098041534424 + }, + { + "auxiliary_loss_clip": 0.0117998, + "auxiliary_loss_mlp": 0.00763875, + "balance_loss_clip": 1.06073093, + "balance_loss_mlp": 1.00109267, + "epoch": 0.23880238080923466, + "flos": 16617340224000.0, + "grad_norm": 2.2642586534869737, + "language_loss": 0.8974998, + "learning_rate": 3.5599788730834384e-06, + "loss": 0.91693836, + "num_input_tokens_seen": 42387455, + "step": 1986, + "time_per_iteration": 2.4817707538604736 + }, + { + "auxiliary_loss_clip": 0.01200265, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.0603857, + "balance_loss_mlp": 1.01988411, + "epoch": 0.23892262369987374, + "flos": 17348409734400.0, + "grad_norm": 2.670884152669051, + "language_loss": 0.78372413, + "learning_rate": 3.559491280016836e-06, + "loss": 0.80601859, + "num_input_tokens_seen": 42405400, + "step": 1987, + "time_per_iteration": 2.4722206592559814 + }, + { + "auxiliary_loss_clip": 0.01182919, + "auxiliary_loss_mlp": 0.01039264, + "balance_loss_clip": 1.05933201, + "balance_loss_mlp": 1.02882707, + "epoch": 0.23904286659051283, + "flos": 22309899540480.0, + "grad_norm": 2.4691006407956957, + "language_loss": 0.70876753, + "learning_rate": 3.5590034503766465e-06, + "loss": 0.73098934, + "num_input_tokens_seen": 42425065, + "step": 1988, + "time_per_iteration": 2.526493549346924 + }, + { + "auxiliary_loss_clip": 0.01210234, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.06063533, + "balance_loss_mlp": 1.02621365, + "epoch": 0.23916310948115194, + "flos": 21178246579200.0, + "grad_norm": 4.9314699812013405, + "language_loss": 0.81078196, + "learning_rate": 3.558515384236874e-06, + "loss": 0.83323479, + "num_input_tokens_seen": 42442495, + "step": 1989, + "time_per_iteration": 2.4772684574127197 + }, + { + "auxiliary_loss_clip": 0.01157176, + "auxiliary_loss_mlp": 0.00764762, + "balance_loss_clip": 1.05485356, + "balance_loss_mlp": 1.00118685, + "epoch": 0.23928335237179102, + "flos": 14137349506560.0, + "grad_norm": 2.0335352429936515, + "language_loss": 0.84043908, + "learning_rate": 3.558027081671556e-06, + "loss": 0.85965842, + "num_input_tokens_seen": 42459480, + "step": 1990, + "time_per_iteration": 2.5206592082977295 + }, + { + "auxiliary_loss_clip": 0.01200441, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.05949664, + "balance_loss_mlp": 1.0281533, + "epoch": 0.2394035952624301, + "flos": 23769596436480.0, + "grad_norm": 1.708095185118585, + "language_loss": 0.69002587, + "learning_rate": 3.557538542754769e-06, + "loss": 0.71241796, + "num_input_tokens_seen": 42479175, + "step": 1991, + "time_per_iteration": 3.338768243789673 + }, + { + "auxiliary_loss_clip": 0.01212787, + "auxiliary_loss_mlp": 0.01036353, + "balance_loss_clip": 1.06277037, + "balance_loss_mlp": 1.0266912, + "epoch": 0.2395238381530692, + "flos": 24206198250240.0, + "grad_norm": 3.227478221003939, + "language_loss": 0.66544425, + "learning_rate": 3.557049767560623e-06, + "loss": 0.68793571, + "num_input_tokens_seen": 42498090, + "step": 1992, + "time_per_iteration": 2.4876692295074463 + }, + { + "auxiliary_loss_clip": 0.01155904, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.05894113, + "balance_loss_mlp": 1.02250493, + "epoch": 0.2396440810437083, + "flos": 25295763450240.0, + "grad_norm": 2.274049790923728, + "language_loss": 0.85965216, + "learning_rate": 3.5565607561632655e-06, + "loss": 0.88153195, + "num_input_tokens_seen": 42516930, + "step": 1993, + "time_per_iteration": 2.623361825942993 + }, + { + "auxiliary_loss_clip": 0.01177635, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.0570519, + "balance_loss_mlp": 1.01871979, + "epoch": 0.23976432393434738, + "flos": 28543093436160.0, + "grad_norm": 2.644441143435662, + "language_loss": 0.79429114, + "learning_rate": 3.5560715086368787e-06, + "loss": 0.81635857, + "num_input_tokens_seen": 42534800, + "step": 1994, + "time_per_iteration": 2.5808451175689697 + }, + { + "auxiliary_loss_clip": 0.01175713, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.0576731, + "balance_loss_mlp": 1.02399766, + "epoch": 0.23988456682498646, + "flos": 19494358945920.0, + "grad_norm": 2.2746090547770277, + "language_loss": 0.82297271, + "learning_rate": 3.5555820250556816e-06, + "loss": 0.84506285, + "num_input_tokens_seen": 42552000, + "step": 1995, + "time_per_iteration": 2.533167839050293 + }, + { + "auxiliary_loss_clip": 0.011888, + "auxiliary_loss_mlp": 0.01035532, + "balance_loss_clip": 1.06117821, + "balance_loss_mlp": 1.02594709, + "epoch": 0.24000480971562557, + "flos": 20266331068800.0, + "grad_norm": 2.381183328399539, + "language_loss": 0.69149089, + "learning_rate": 3.5550923054939278e-06, + "loss": 0.71373427, + "num_input_tokens_seen": 42571455, + "step": 1996, + "time_per_iteration": 2.5521271228790283 + }, + { + "auxiliary_loss_clip": 0.01146469, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.05402184, + "balance_loss_mlp": 1.02445889, + "epoch": 0.24012505260626466, + "flos": 25443176866560.0, + "grad_norm": 2.0017799039157533, + "language_loss": 0.74773115, + "learning_rate": 3.5546023500259083e-06, + "loss": 0.76953578, + "num_input_tokens_seen": 42592550, + "step": 1997, + "time_per_iteration": 4.325376987457275 + }, + { + "auxiliary_loss_clip": 0.0115644, + "auxiliary_loss_mlp": 0.01033456, + "balance_loss_clip": 1.05416584, + "balance_loss_mlp": 1.02372873, + "epoch": 0.24024529549690374, + "flos": 15553342529280.0, + "grad_norm": 4.136079711503388, + "language_loss": 0.80801117, + "learning_rate": 3.5541121587259477e-06, + "loss": 0.82991016, + "num_input_tokens_seen": 42610385, + "step": 1998, + "time_per_iteration": 3.39522123336792 + }, + { + "auxiliary_loss_clip": 0.01120171, + "auxiliary_loss_mlp": 0.01027313, + "balance_loss_clip": 1.04854107, + "balance_loss_mlp": 1.02526307, + "epoch": 0.24036553838754285, + "flos": 57122351867520.0, + "grad_norm": 0.8263216891795258, + "language_loss": 0.57880324, + "learning_rate": 3.553621731668408e-06, + "loss": 0.60027814, + "num_input_tokens_seen": 42673595, + "step": 1999, + "time_per_iteration": 3.084599256515503 + }, + { + "auxiliary_loss_clip": 0.01190031, + "auxiliary_loss_mlp": 0.0103552, + "balance_loss_clip": 1.05644226, + "balance_loss_mlp": 1.02538168, + "epoch": 0.24048578127818193, + "flos": 24969946158720.0, + "grad_norm": 2.173614219858502, + "language_loss": 0.83147275, + "learning_rate": 3.553131068927688e-06, + "loss": 0.85372829, + "num_input_tokens_seen": 42692000, + "step": 2000, + "time_per_iteration": 2.5484702587127686 + }, + { + "auxiliary_loss_clip": 0.01168126, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.0582186, + "balance_loss_mlp": 1.02546728, + "epoch": 0.24060602416882101, + "flos": 23330947547520.0, + "grad_norm": 1.7355793923444756, + "language_loss": 0.80381954, + "learning_rate": 3.552640170578219e-06, + "loss": 0.82583958, + "num_input_tokens_seen": 42712250, + "step": 2001, + "time_per_iteration": 2.5578243732452393 + }, + { + "auxiliary_loss_clip": 0.01184208, + "auxiliary_loss_mlp": 0.01042265, + "balance_loss_clip": 1.0605073, + "balance_loss_mlp": 1.03318691, + "epoch": 0.2407262670594601, + "flos": 14173260128640.0, + "grad_norm": 2.11599837347217, + "language_loss": 0.77785522, + "learning_rate": 3.5521490366944703e-06, + "loss": 0.80012, + "num_input_tokens_seen": 42729900, + "step": 2002, + "time_per_iteration": 2.518450975418091 + }, + { + "auxiliary_loss_clip": 0.01169127, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.05627108, + "balance_loss_mlp": 1.02252769, + "epoch": 0.2408465099500992, + "flos": 13663113217920.0, + "grad_norm": 2.6119038656090794, + "language_loss": 0.79861313, + "learning_rate": 3.5516576673509474e-06, + "loss": 0.82062197, + "num_input_tokens_seen": 42747900, + "step": 2003, + "time_per_iteration": 2.53810977935791 + }, + { + "auxiliary_loss_clip": 0.01215718, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.06410551, + "balance_loss_mlp": 1.0290246, + "epoch": 0.2409667528407383, + "flos": 31248029076480.0, + "grad_norm": 2.0195210994399795, + "language_loss": 0.86382061, + "learning_rate": 3.5511660626221896e-06, + "loss": 0.88636291, + "num_input_tokens_seen": 42768540, + "step": 2004, + "time_per_iteration": 2.553990364074707 + }, + { + "auxiliary_loss_clip": 0.01182337, + "auxiliary_loss_mlp": 0.00764808, + "balance_loss_clip": 1.06022203, + "balance_loss_mlp": 1.00137043, + "epoch": 0.24108699573137737, + "flos": 22199941031040.0, + "grad_norm": 2.370699556953951, + "language_loss": 0.89387584, + "learning_rate": 3.5506742225827744e-06, + "loss": 0.9133473, + "num_input_tokens_seen": 42785395, + "step": 2005, + "time_per_iteration": 2.5323922634124756 + }, + { + "auxiliary_loss_clip": 0.01170277, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.05847931, + "balance_loss_mlp": 1.02695262, + "epoch": 0.24120723862201648, + "flos": 26103035664000.0, + "grad_norm": 2.1499744919736767, + "language_loss": 0.9029125, + "learning_rate": 3.5501821473073116e-06, + "loss": 0.92498481, + "num_input_tokens_seen": 42801980, + "step": 2006, + "time_per_iteration": 2.579784870147705 + }, + { + "auxiliary_loss_clip": 0.01162718, + "auxiliary_loss_mlp": 0.01042249, + "balance_loss_clip": 1.05691576, + "balance_loss_mlp": 1.03118014, + "epoch": 0.24132748151265557, + "flos": 18624926246400.0, + "grad_norm": 2.021978399084634, + "language_loss": 0.86715221, + "learning_rate": 3.54968983687045e-06, + "loss": 0.88920188, + "num_input_tokens_seen": 42818850, + "step": 2007, + "time_per_iteration": 2.555131196975708 + }, + { + "auxiliary_loss_clip": 0.0118702, + "auxiliary_loss_mlp": 0.0104759, + "balance_loss_clip": 1.06143117, + "balance_loss_mlp": 1.03646731, + "epoch": 0.24144772440329465, + "flos": 15267673664640.0, + "grad_norm": 2.45496738679731, + "language_loss": 0.88954461, + "learning_rate": 3.549197291346872e-06, + "loss": 0.91189069, + "num_input_tokens_seen": 42835375, + "step": 2008, + "time_per_iteration": 2.5052084922790527 + }, + { + "auxiliary_loss_clip": 0.01200798, + "auxiliary_loss_mlp": 0.01038247, + "balance_loss_clip": 1.06058848, + "balance_loss_mlp": 1.02877605, + "epoch": 0.24156796729393373, + "flos": 24024274842240.0, + "grad_norm": 2.2877681148079785, + "language_loss": 0.79204863, + "learning_rate": 3.548704510811297e-06, + "loss": 0.81443906, + "num_input_tokens_seen": 42854570, + "step": 2009, + "time_per_iteration": 2.513949155807495 + }, + { + "auxiliary_loss_clip": 0.01159791, + "auxiliary_loss_mlp": 0.0104702, + "balance_loss_clip": 1.05521989, + "balance_loss_mlp": 1.03644013, + "epoch": 0.24168821018457284, + "flos": 26286790665600.0, + "grad_norm": 2.1958659209856344, + "language_loss": 0.75249618, + "learning_rate": 3.5482114953384787e-06, + "loss": 0.77456427, + "num_input_tokens_seen": 42873800, + "step": 2010, + "time_per_iteration": 2.5988929271698 + }, + { + "auxiliary_loss_clip": 0.01203421, + "auxiliary_loss_mlp": 0.01041529, + "balance_loss_clip": 1.06248856, + "balance_loss_mlp": 1.03088951, + "epoch": 0.24180845307521193, + "flos": 18223193560320.0, + "grad_norm": 3.09569395858439, + "language_loss": 0.84602809, + "learning_rate": 3.5477182450032077e-06, + "loss": 0.86847758, + "num_input_tokens_seen": 42892400, + "step": 2011, + "time_per_iteration": 2.4890429973602295 + }, + { + "auxiliary_loss_clip": 0.01196911, + "auxiliary_loss_mlp": 0.01042508, + "balance_loss_clip": 1.06060362, + "balance_loss_mlp": 1.03245819, + "epoch": 0.241928695965851, + "flos": 20449260057600.0, + "grad_norm": 2.4190500559330257, + "language_loss": 0.83505142, + "learning_rate": 3.5472247598803097e-06, + "loss": 0.85744566, + "num_input_tokens_seen": 42911745, + "step": 2012, + "time_per_iteration": 2.4939379692077637 + }, + { + "auxiliary_loss_clip": 0.01216312, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.06358266, + "balance_loss_mlp": 1.03129733, + "epoch": 0.24204893885649012, + "flos": 25556475340800.0, + "grad_norm": 2.5065725840685147, + "language_loss": 0.85111445, + "learning_rate": 3.546731040044645e-06, + "loss": 0.873694, + "num_input_tokens_seen": 42926915, + "step": 2013, + "time_per_iteration": 2.5154919624328613 + }, + { + "auxiliary_loss_clip": 0.01214434, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_clip": 1.06319857, + "balance_loss_mlp": 1.01926494, + "epoch": 0.2421691817471292, + "flos": 30660207004800.0, + "grad_norm": 1.8008223679215087, + "language_loss": 0.75196111, + "learning_rate": 3.546237085571112e-06, + "loss": 0.77439767, + "num_input_tokens_seen": 42945350, + "step": 2014, + "time_per_iteration": 2.5209712982177734 + }, + { + "auxiliary_loss_clip": 0.01201106, + "auxiliary_loss_mlp": 0.01034831, + "balance_loss_clip": 1.06409693, + "balance_loss_mlp": 1.02533054, + "epoch": 0.24228942463776829, + "flos": 21945011230080.0, + "grad_norm": 6.148319701296482, + "language_loss": 0.72269219, + "learning_rate": 3.5457428965346425e-06, + "loss": 0.74505156, + "num_input_tokens_seen": 42964290, + "step": 2015, + "time_per_iteration": 2.5096564292907715 + }, + { + "auxiliary_loss_clip": 0.01137212, + "auxiliary_loss_mlp": 0.01034901, + "balance_loss_clip": 1.05390656, + "balance_loss_mlp": 1.02496481, + "epoch": 0.2424096675284074, + "flos": 33984493879680.0, + "grad_norm": 1.5657157676524969, + "language_loss": 0.74686772, + "learning_rate": 3.545248473010205e-06, + "loss": 0.7685889, + "num_input_tokens_seen": 42987095, + "step": 2016, + "time_per_iteration": 2.7369117736816406 + }, + { + "auxiliary_loss_clip": 0.01218534, + "auxiliary_loss_mlp": 0.00765327, + "balance_loss_clip": 1.06256843, + "balance_loss_mlp": 1.00142455, + "epoch": 0.24252991041904648, + "flos": 21653416621440.0, + "grad_norm": 1.6743571227131462, + "language_loss": 0.87753975, + "learning_rate": 3.544753815072802e-06, + "loss": 0.89737833, + "num_input_tokens_seen": 43005750, + "step": 2017, + "time_per_iteration": 2.492324113845825 + }, + { + "auxiliary_loss_clip": 0.01114044, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.04609799, + "balance_loss_mlp": 1.02315176, + "epoch": 0.24265015330968556, + "flos": 21870065502720.0, + "grad_norm": 1.9727245398722877, + "language_loss": 0.88129169, + "learning_rate": 3.544258922797474e-06, + "loss": 0.90276372, + "num_input_tokens_seen": 43023870, + "step": 2018, + "time_per_iteration": 3.4754931926727295 + }, + { + "auxiliary_loss_clip": 0.01213558, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.06346726, + "balance_loss_mlp": 1.02721834, + "epoch": 0.24277039620032465, + "flos": 25628260671360.0, + "grad_norm": 1.5793490770632446, + "language_loss": 0.77898896, + "learning_rate": 3.543763796259295e-06, + "loss": 0.80148989, + "num_input_tokens_seen": 43043825, + "step": 2019, + "time_per_iteration": 2.499797821044922 + }, + { + "auxiliary_loss_clip": 0.0119975, + "auxiliary_loss_mlp": 0.01038526, + "balance_loss_clip": 1.06118834, + "balance_loss_mlp": 1.02820837, + "epoch": 0.24289063909096376, + "flos": 26286575184000.0, + "grad_norm": 1.7661350007665524, + "language_loss": 0.90859336, + "learning_rate": 3.5432684355333754e-06, + "loss": 0.93097615, + "num_input_tokens_seen": 43062480, + "step": 2020, + "time_per_iteration": 2.6600663661956787 + }, + { + "auxiliary_loss_clip": 0.01199005, + "auxiliary_loss_mlp": 0.0103918, + "balance_loss_clip": 1.05935287, + "balance_loss_mlp": 1.02935088, + "epoch": 0.24301088198160284, + "flos": 25075056332160.0, + "grad_norm": 4.423105025546754, + "language_loss": 0.76537555, + "learning_rate": 3.5427728406948613e-06, + "loss": 0.7877574, + "num_input_tokens_seen": 43081595, + "step": 2021, + "time_per_iteration": 2.519862651824951 + }, + { + "auxiliary_loss_clip": 0.01106161, + "auxiliary_loss_mlp": 0.01016923, + "balance_loss_clip": 1.04079425, + "balance_loss_mlp": 1.01457453, + "epoch": 0.24313112487224192, + "flos": 69900948673920.0, + "grad_norm": 0.7546676576833723, + "language_loss": 0.57935798, + "learning_rate": 3.542277011818934e-06, + "loss": 0.60058886, + "num_input_tokens_seen": 43145430, + "step": 2022, + "time_per_iteration": 3.404602289199829 + }, + { + "auxiliary_loss_clip": 0.01186284, + "auxiliary_loss_mlp": 0.01035743, + "balance_loss_clip": 1.06215978, + "balance_loss_mlp": 1.02656412, + "epoch": 0.24325136776288103, + "flos": 40662334235520.0, + "grad_norm": 2.093529315317973, + "language_loss": 0.74164367, + "learning_rate": 3.5417809489808104e-06, + "loss": 0.76386392, + "num_input_tokens_seen": 43167040, + "step": 2023, + "time_per_iteration": 3.550360679626465 + }, + { + "auxiliary_loss_clip": 0.01201141, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.06249368, + "balance_loss_mlp": 1.02625513, + "epoch": 0.24337161065352012, + "flos": 25046400257280.0, + "grad_norm": 2.049317021195419, + "language_loss": 0.72529352, + "learning_rate": 3.5412846522557422e-06, + "loss": 0.74765539, + "num_input_tokens_seen": 43187930, + "step": 2024, + "time_per_iteration": 3.3872809410095215 + }, + { + "auxiliary_loss_clip": 0.01214038, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.06300068, + "balance_loss_mlp": 1.02782702, + "epoch": 0.2434918535441592, + "flos": 18661160090880.0, + "grad_norm": 2.0911263302162486, + "language_loss": 0.73977381, + "learning_rate": 3.540788121719018e-06, + "loss": 0.7622906, + "num_input_tokens_seen": 43206350, + "step": 2025, + "time_per_iteration": 3.2318644523620605 + }, + { + "auxiliary_loss_clip": 0.01161845, + "auxiliary_loss_mlp": 0.01038611, + "balance_loss_clip": 1.05774343, + "balance_loss_mlp": 1.02838254, + "epoch": 0.24361209643479828, + "flos": 23915142345600.0, + "grad_norm": 1.96263399866518, + "language_loss": 0.81964219, + "learning_rate": 3.5402913574459604e-06, + "loss": 0.84164679, + "num_input_tokens_seen": 43226255, + "step": 2026, + "time_per_iteration": 2.556619882583618 + }, + { + "auxiliary_loss_clip": 0.01131489, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.0484376, + "balance_loss_mlp": 1.02795196, + "epoch": 0.2437323393254374, + "flos": 28657505232000.0, + "grad_norm": 1.5884391313971205, + "language_loss": 0.86278164, + "learning_rate": 3.5397943595119297e-06, + "loss": 0.8844676, + "num_input_tokens_seen": 43247675, + "step": 2027, + "time_per_iteration": 2.677842378616333 + }, + { + "auxiliary_loss_clip": 0.01177597, + "auxiliary_loss_mlp": 0.01037646, + "balance_loss_clip": 1.05822158, + "balance_loss_mlp": 1.02702475, + "epoch": 0.24385258221607647, + "flos": 23550325862400.0, + "grad_norm": 2.9476236891476213, + "language_loss": 0.77395499, + "learning_rate": 3.5392971279923177e-06, + "loss": 0.79610747, + "num_input_tokens_seen": 43265895, + "step": 2028, + "time_per_iteration": 2.5468316078186035 + }, + { + "auxiliary_loss_clip": 0.01161203, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.05382395, + "balance_loss_mlp": 1.02217269, + "epoch": 0.24397282510671556, + "flos": 25336091445120.0, + "grad_norm": 2.155010711190702, + "language_loss": 0.82711738, + "learning_rate": 3.5387996629625557e-06, + "loss": 0.84906363, + "num_input_tokens_seen": 43283485, + "step": 2029, + "time_per_iteration": 2.5875539779663086 + }, + { + "auxiliary_loss_clip": 0.01136193, + "auxiliary_loss_mlp": 0.01001549, + "balance_loss_clip": 1.04746079, + "balance_loss_mlp": 0.99948645, + "epoch": 0.24409306799735467, + "flos": 65187421430400.0, + "grad_norm": 0.810109331667588, + "language_loss": 0.55020881, + "learning_rate": 3.5383019644981083e-06, + "loss": 0.57158619, + "num_input_tokens_seen": 43347180, + "step": 2030, + "time_per_iteration": 3.0763237476348877 + }, + { + "auxiliary_loss_clip": 0.01182364, + "auxiliary_loss_mlp": 0.01027815, + "balance_loss_clip": 1.05807924, + "balance_loss_mlp": 1.01784921, + "epoch": 0.24421331088799375, + "flos": 19537093152000.0, + "grad_norm": 2.361511047805532, + "language_loss": 0.72730899, + "learning_rate": 3.5378040326744763e-06, + "loss": 0.74941087, + "num_input_tokens_seen": 43366665, + "step": 2031, + "time_per_iteration": 2.498952865600586 + }, + { + "auxiliary_loss_clip": 0.01170289, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.05861604, + "balance_loss_mlp": 1.02385211, + "epoch": 0.24433355377863283, + "flos": 21068575378560.0, + "grad_norm": 2.4674421774485764, + "language_loss": 0.85472143, + "learning_rate": 3.5373058675671946e-06, + "loss": 0.87675679, + "num_input_tokens_seen": 43384670, + "step": 2032, + "time_per_iteration": 2.543724536895752 + }, + { + "auxiliary_loss_clip": 0.0114541, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.05329907, + "balance_loss_mlp": 1.02300346, + "epoch": 0.24445379666927192, + "flos": 22637189289600.0, + "grad_norm": 2.7095208164151057, + "language_loss": 0.72379386, + "learning_rate": 3.536807469251836e-06, + "loss": 0.74558318, + "num_input_tokens_seen": 43403825, + "step": 2033, + "time_per_iteration": 2.586548328399658 + }, + { + "auxiliary_loss_clip": 0.01172252, + "auxiliary_loss_mlp": 0.01031929, + "balance_loss_clip": 1.05458188, + "balance_loss_mlp": 1.0223093, + "epoch": 0.24457403955991103, + "flos": 21251612108160.0, + "grad_norm": 3.4754204057747735, + "language_loss": 0.82476127, + "learning_rate": 3.5363088378040055e-06, + "loss": 0.84680307, + "num_input_tokens_seen": 43422715, + "step": 2034, + "time_per_iteration": 2.580615997314453 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.00754292, + "balance_loss_clip": 1.04519033, + "balance_loss_mlp": 1.0009644, + "epoch": 0.2446942824505501, + "flos": 66997820764800.0, + "grad_norm": 0.7546160432939819, + "language_loss": 0.64419895, + "learning_rate": 3.5358099732993463e-06, + "loss": 0.66307592, + "num_input_tokens_seen": 43481825, + "step": 2035, + "time_per_iteration": 2.9549691677093506 + }, + { + "auxiliary_loss_clip": 0.01188764, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.05916369, + "balance_loss_mlp": 1.02451491, + "epoch": 0.2448145253411892, + "flos": 20411122792320.0, + "grad_norm": 4.0376159961893725, + "language_loss": 0.89357877, + "learning_rate": 3.535310875813535e-06, + "loss": 0.91580617, + "num_input_tokens_seen": 43500220, + "step": 2036, + "time_per_iteration": 2.54103422164917 + }, + { + "auxiliary_loss_clip": 0.01195911, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.05919862, + "balance_loss_mlp": 1.02315331, + "epoch": 0.2449347682318283, + "flos": 28804739080320.0, + "grad_norm": 2.136723515496185, + "language_loss": 0.81929231, + "learning_rate": 3.5348115454222843e-06, + "loss": 0.84157741, + "num_input_tokens_seen": 43522805, + "step": 2037, + "time_per_iteration": 2.5451924800872803 + }, + { + "auxiliary_loss_clip": 0.01178184, + "auxiliary_loss_mlp": 0.01046084, + "balance_loss_clip": 1.05475402, + "balance_loss_mlp": 1.03624344, + "epoch": 0.2450550111224674, + "flos": 22528990546560.0, + "grad_norm": 2.1410221616501515, + "language_loss": 0.85668814, + "learning_rate": 3.5343119822013425e-06, + "loss": 0.87893081, + "num_input_tokens_seen": 43541915, + "step": 2038, + "time_per_iteration": 2.524336338043213 + }, + { + "auxiliary_loss_clip": 0.01204193, + "auxiliary_loss_mlp": 0.01043001, + "balance_loss_clip": 1.060987, + "balance_loss_mlp": 1.03256416, + "epoch": 0.24517525401310647, + "flos": 21759137326080.0, + "grad_norm": 3.1350782773695625, + "language_loss": 0.77415615, + "learning_rate": 3.533812186226493e-06, + "loss": 0.79662812, + "num_input_tokens_seen": 43562625, + "step": 2039, + "time_per_iteration": 2.4927306175231934 + }, + { + "auxiliary_loss_clip": 0.01207834, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.05925858, + "balance_loss_mlp": 1.0221417, + "epoch": 0.24529549690374555, + "flos": 25043311687680.0, + "grad_norm": 1.976807956764818, + "language_loss": 0.7574625, + "learning_rate": 3.5333121575735545e-06, + "loss": 0.77985442, + "num_input_tokens_seen": 43582265, + "step": 2040, + "time_per_iteration": 2.467381715774536 + }, + { + "auxiliary_loss_clip": 0.01179493, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.05795443, + "balance_loss_mlp": 1.02340174, + "epoch": 0.24541573979438466, + "flos": 32123638915200.0, + "grad_norm": 2.876660177603598, + "language_loss": 0.75867587, + "learning_rate": 3.532811896318381e-06, + "loss": 0.78080618, + "num_input_tokens_seen": 43604335, + "step": 2041, + "time_per_iteration": 2.621657609939575 + }, + { + "auxiliary_loss_clip": 0.01170441, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.05528152, + "balance_loss_mlp": 1.01926112, + "epoch": 0.24553598268502375, + "flos": 31357556622720.0, + "grad_norm": 2.0825100759679556, + "language_loss": 0.81932783, + "learning_rate": 3.5323114025368615e-06, + "loss": 0.84132552, + "num_input_tokens_seen": 43619400, + "step": 2042, + "time_per_iteration": 2.6108646392822266 + }, + { + "auxiliary_loss_clip": 0.01189871, + "auxiliary_loss_mlp": 0.01030265, + "balance_loss_clip": 1.05483294, + "balance_loss_mlp": 1.02075195, + "epoch": 0.24565622557566283, + "flos": 14027462824320.0, + "grad_norm": 2.184648304099261, + "language_loss": 0.8168236, + "learning_rate": 3.53181067630492e-06, + "loss": 0.8390249, + "num_input_tokens_seen": 43636870, + "step": 2043, + "time_per_iteration": 2.488929510116577 + }, + { + "auxiliary_loss_clip": 0.01172107, + "auxiliary_loss_mlp": 0.01039677, + "balance_loss_clip": 1.0547843, + "balance_loss_mlp": 1.03005052, + "epoch": 0.24577646846630194, + "flos": 16581465515520.0, + "grad_norm": 1.8214350373296109, + "language_loss": 0.76064312, + "learning_rate": 3.5313097176985175e-06, + "loss": 0.78276092, + "num_input_tokens_seen": 43655180, + "step": 2044, + "time_per_iteration": 2.494130849838257 + }, + { + "auxiliary_loss_clip": 0.01195588, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.05939984, + "balance_loss_mlp": 1.02100754, + "epoch": 0.24589671135694102, + "flos": 18807424272000.0, + "grad_norm": 1.8690621047120717, + "language_loss": 0.8122102, + "learning_rate": 3.5308085267936482e-06, + "loss": 0.83446932, + "num_input_tokens_seen": 43672895, + "step": 2045, + "time_per_iteration": 3.2762341499328613 + }, + { + "auxiliary_loss_clip": 0.01135645, + "auxiliary_loss_mlp": 0.00763297, + "balance_loss_clip": 1.05259132, + "balance_loss_mlp": 1.00127506, + "epoch": 0.2460169542475801, + "flos": 19938538529280.0, + "grad_norm": 1.7637586934780578, + "language_loss": 0.89747202, + "learning_rate": 3.530307103666342e-06, + "loss": 0.91646147, + "num_input_tokens_seen": 43691975, + "step": 2046, + "time_per_iteration": 2.6238620281219482 + }, + { + "auxiliary_loss_clip": 0.01172407, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.05668688, + "balance_loss_mlp": 1.02040827, + "epoch": 0.24613719713821922, + "flos": 24171221381760.0, + "grad_norm": 1.918258230136605, + "language_loss": 0.80333829, + "learning_rate": 3.5298054483926658e-06, + "loss": 0.82536077, + "num_input_tokens_seen": 43712670, + "step": 2047, + "time_per_iteration": 2.558009386062622 + }, + { + "auxiliary_loss_clip": 0.01204154, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.06022596, + "balance_loss_mlp": 1.02528846, + "epoch": 0.2462574400288583, + "flos": 30221055325440.0, + "grad_norm": 1.946700773247387, + "language_loss": 0.83041006, + "learning_rate": 3.5293035610487187e-06, + "loss": 0.85280013, + "num_input_tokens_seen": 43732035, + "step": 2048, + "time_per_iteration": 2.5432701110839844 + }, + { + "auxiliary_loss_clip": 0.01094238, + "auxiliary_loss_mlp": 0.0100003, + "balance_loss_clip": 1.03473091, + "balance_loss_mlp": 0.99782497, + "epoch": 0.24637768291949738, + "flos": 68943030819840.0, + "grad_norm": 0.7355315456194561, + "language_loss": 0.62013125, + "learning_rate": 3.5288014417106374e-06, + "loss": 0.64107394, + "num_input_tokens_seen": 43798055, + "step": 2049, + "time_per_iteration": 3.1210954189300537 + }, + { + "auxiliary_loss_clip": 0.01164488, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.05463457, + "balance_loss_mlp": 1.02562737, + "epoch": 0.24649792581013646, + "flos": 34383999922560.0, + "grad_norm": 1.8033945803821139, + "language_loss": 0.75461221, + "learning_rate": 3.528299090454593e-06, + "loss": 0.77660674, + "num_input_tokens_seen": 43818590, + "step": 2050, + "time_per_iteration": 3.580585241317749 + }, + { + "auxiliary_loss_clip": 0.01198203, + "auxiliary_loss_mlp": 0.01031496, + "balance_loss_clip": 1.05714953, + "balance_loss_mlp": 1.02179265, + "epoch": 0.24661816870077558, + "flos": 19680448331520.0, + "grad_norm": 2.4760690033290906, + "language_loss": 0.8288486, + "learning_rate": 3.527796507356792e-06, + "loss": 0.85114563, + "num_input_tokens_seen": 43832480, + "step": 2051, + "time_per_iteration": 3.2736921310424805 + }, + { + "auxiliary_loss_clip": 0.01198782, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.05769539, + "balance_loss_mlp": 1.02580333, + "epoch": 0.24673841159141466, + "flos": 20002279213440.0, + "grad_norm": 2.8511891206930087, + "language_loss": 0.90640426, + "learning_rate": 3.527293692493475e-06, + "loss": 0.9287442, + "num_input_tokens_seen": 43848345, + "step": 2052, + "time_per_iteration": 3.175246000289917 + }, + { + "auxiliary_loss_clip": 0.01197318, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.05705571, + "balance_loss_mlp": 1.02398872, + "epoch": 0.24685865448205374, + "flos": 21646593037440.0, + "grad_norm": 4.698295329890442, + "language_loss": 0.73035491, + "learning_rate": 3.52679064594092e-06, + "loss": 0.75266886, + "num_input_tokens_seen": 43865685, + "step": 2053, + "time_per_iteration": 2.487556219100952 + }, + { + "auxiliary_loss_clip": 0.01135895, + "auxiliary_loss_mlp": 0.01034459, + "balance_loss_clip": 1.04321694, + "balance_loss_mlp": 1.02548802, + "epoch": 0.24697889737269285, + "flos": 17960470508160.0, + "grad_norm": 2.366456713382833, + "language_loss": 0.74841893, + "learning_rate": 3.5262873677754375e-06, + "loss": 0.77012247, + "num_input_tokens_seen": 43883690, + "step": 2054, + "time_per_iteration": 2.579397439956665 + }, + { + "auxiliary_loss_clip": 0.01205296, + "auxiliary_loss_mlp": 0.01033834, + "balance_loss_clip": 1.05769897, + "balance_loss_mlp": 1.02456522, + "epoch": 0.24709914026333193, + "flos": 27344611221120.0, + "grad_norm": 1.5794420879481634, + "language_loss": 0.80695426, + "learning_rate": 3.5257838580733745e-06, + "loss": 0.82934558, + "num_input_tokens_seen": 43903295, + "step": 2055, + "time_per_iteration": 2.513293743133545 + }, + { + "auxiliary_loss_clip": 0.01198354, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.05797529, + "balance_loss_mlp": 1.02474952, + "epoch": 0.24721938315397102, + "flos": 19275519335040.0, + "grad_norm": 2.2452749283039934, + "language_loss": 0.87138367, + "learning_rate": 3.5252801169111138e-06, + "loss": 0.89371026, + "num_input_tokens_seen": 43920960, + "step": 2056, + "time_per_iteration": 2.4872045516967773 + }, + { + "auxiliary_loss_clip": 0.01176834, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.05707264, + "balance_loss_mlp": 1.02405167, + "epoch": 0.2473396260446101, + "flos": 23185796688000.0, + "grad_norm": 1.7779788533585361, + "language_loss": 0.79825628, + "learning_rate": 3.524776144365072e-06, + "loss": 0.82035679, + "num_input_tokens_seen": 43939415, + "step": 2057, + "time_per_iteration": 2.544431209564209 + }, + { + "auxiliary_loss_clip": 0.01172434, + "auxiliary_loss_mlp": 0.01037819, + "balance_loss_clip": 1.05627775, + "balance_loss_mlp": 1.02811563, + "epoch": 0.2474598689352492, + "flos": 21142443697920.0, + "grad_norm": 1.747216171799763, + "language_loss": 0.79203486, + "learning_rate": 3.5242719405117016e-06, + "loss": 0.8141374, + "num_input_tokens_seen": 43959220, + "step": 2058, + "time_per_iteration": 2.553508996963501 + }, + { + "auxiliary_loss_clip": 0.01183398, + "auxiliary_loss_mlp": 0.00764941, + "balance_loss_clip": 1.05795598, + "balance_loss_mlp": 1.00136673, + "epoch": 0.2475801118258883, + "flos": 21648352803840.0, + "grad_norm": 2.4071583400354455, + "language_loss": 0.75201744, + "learning_rate": 3.5237675054274893e-06, + "loss": 0.77150083, + "num_input_tokens_seen": 43978420, + "step": 2059, + "time_per_iteration": 2.564394235610962 + }, + { + "auxiliary_loss_clip": 0.01194892, + "auxiliary_loss_mlp": 0.01036708, + "balance_loss_clip": 1.05726445, + "balance_loss_mlp": 1.02686167, + "epoch": 0.24770035471652738, + "flos": 22674500542080.0, + "grad_norm": 1.9041185073414162, + "language_loss": 0.80239081, + "learning_rate": 3.5232628391889584e-06, + "loss": 0.82470685, + "num_input_tokens_seen": 43996710, + "step": 2060, + "time_per_iteration": 2.5405423641204834 + }, + { + "auxiliary_loss_clip": 0.01147297, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.05293083, + "balance_loss_mlp": 1.02075791, + "epoch": 0.2478205976071665, + "flos": 22163814927360.0, + "grad_norm": 2.552710114290554, + "language_loss": 0.64309239, + "learning_rate": 3.522757941872666e-06, + "loss": 0.66486293, + "num_input_tokens_seen": 44014865, + "step": 2061, + "time_per_iteration": 2.6115903854370117 + }, + { + "auxiliary_loss_clip": 0.0121193, + "auxiliary_loss_mlp": 0.00764735, + "balance_loss_clip": 1.06299806, + "balance_loss_mlp": 1.00151336, + "epoch": 0.24794084049780557, + "flos": 24973106555520.0, + "grad_norm": 1.9459748099389635, + "language_loss": 0.82584316, + "learning_rate": 3.5222528135552042e-06, + "loss": 0.84560978, + "num_input_tokens_seen": 44036325, + "step": 2062, + "time_per_iteration": 2.4948155879974365 + }, + { + "auxiliary_loss_clip": 0.01193669, + "auxiliary_loss_mlp": 0.01038304, + "balance_loss_clip": 1.05974042, + "balance_loss_mlp": 1.02858233, + "epoch": 0.24806108338844465, + "flos": 18296379521280.0, + "grad_norm": 2.0349655137334497, + "language_loss": 0.80289996, + "learning_rate": 3.521747454313201e-06, + "loss": 0.82521963, + "num_input_tokens_seen": 44055005, + "step": 2063, + "time_per_iteration": 2.5140719413757324 + }, + { + "auxiliary_loss_clip": 0.01154758, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.04835868, + "balance_loss_mlp": 1.02133822, + "epoch": 0.24818132627908374, + "flos": 19282163351040.0, + "grad_norm": 1.8442731566255053, + "language_loss": 0.66645384, + "learning_rate": 3.521241864223319e-06, + "loss": 0.68830836, + "num_input_tokens_seen": 44073965, + "step": 2064, + "time_per_iteration": 2.591148853302002 + }, + { + "auxiliary_loss_clip": 0.01099617, + "auxiliary_loss_mlp": 0.01012281, + "balance_loss_clip": 1.03159118, + "balance_loss_mlp": 1.00978971, + "epoch": 0.24830156916972285, + "flos": 70285837881600.0, + "grad_norm": 0.7902384694965982, + "language_loss": 0.62013566, + "learning_rate": 3.5207360433622552e-06, + "loss": 0.64125466, + "num_input_tokens_seen": 44135965, + "step": 2065, + "time_per_iteration": 3.106696128845215 + }, + { + "auxiliary_loss_clip": 0.0117789, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.05839241, + "balance_loss_mlp": 1.02717733, + "epoch": 0.24842181206036193, + "flos": 40409128287360.0, + "grad_norm": 2.7841112066361813, + "language_loss": 0.74404866, + "learning_rate": 3.5202299918067437e-06, + "loss": 0.76619053, + "num_input_tokens_seen": 44159560, + "step": 2066, + "time_per_iteration": 2.7093608379364014 + }, + { + "auxiliary_loss_clip": 0.01191923, + "auxiliary_loss_mlp": 0.01029238, + "balance_loss_clip": 1.0574069, + "balance_loss_mlp": 1.02025592, + "epoch": 0.248542054951001, + "flos": 20082432412800.0, + "grad_norm": 2.520681619220884, + "language_loss": 0.69171578, + "learning_rate": 3.519723709633551e-06, + "loss": 0.71392739, + "num_input_tokens_seen": 44178320, + "step": 2067, + "time_per_iteration": 2.506783962249756 + }, + { + "auxiliary_loss_clip": 0.01176716, + "auxiliary_loss_mlp": 0.01034366, + "balance_loss_clip": 1.05890632, + "balance_loss_mlp": 1.02412593, + "epoch": 0.24866229784164012, + "flos": 23513948363520.0, + "grad_norm": 2.2137245662377234, + "language_loss": 0.83609009, + "learning_rate": 3.519217196919479e-06, + "loss": 0.85820091, + "num_input_tokens_seen": 44197305, + "step": 2068, + "time_per_iteration": 2.6375486850738525 + }, + { + "auxiliary_loss_clip": 0.01187801, + "auxiliary_loss_mlp": 0.01036683, + "balance_loss_clip": 1.06230497, + "balance_loss_mlp": 1.0276525, + "epoch": 0.2487825407322792, + "flos": 19865101173120.0, + "grad_norm": 1.7368374066876753, + "language_loss": 0.73603821, + "learning_rate": 3.518710453741367e-06, + "loss": 0.75828302, + "num_input_tokens_seen": 44216505, + "step": 2069, + "time_per_iteration": 2.523452043533325 + }, + { + "auxiliary_loss_clip": 0.01171708, + "auxiliary_loss_mlp": 0.0076475, + "balance_loss_clip": 1.0539875, + "balance_loss_mlp": 1.00146341, + "epoch": 0.2489027836229183, + "flos": 22017622573440.0, + "grad_norm": 2.8041739847238407, + "language_loss": 0.67833257, + "learning_rate": 3.518203480176086e-06, + "loss": 0.69769704, + "num_input_tokens_seen": 44235435, + "step": 2070, + "time_per_iteration": 2.554170846939087 + }, + { + "auxiliary_loss_clip": 0.01118645, + "auxiliary_loss_mlp": 0.01041243, + "balance_loss_clip": 1.04649854, + "balance_loss_mlp": 1.03166485, + "epoch": 0.2490230265135574, + "flos": 23294354567040.0, + "grad_norm": 1.7876822009766662, + "language_loss": 0.80530941, + "learning_rate": 3.517696276300545e-06, + "loss": 0.82690835, + "num_input_tokens_seen": 44256975, + "step": 2071, + "time_per_iteration": 2.6718549728393555 + }, + { + "auxiliary_loss_clip": 0.01201026, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.06533027, + "balance_loss_mlp": 1.02928329, + "epoch": 0.24914326940419648, + "flos": 19826784339840.0, + "grad_norm": 2.690801197211992, + "language_loss": 0.6918785, + "learning_rate": 3.517188842191685e-06, + "loss": 0.71428317, + "num_input_tokens_seen": 44275125, + "step": 2072, + "time_per_iteration": 3.311549425125122 + }, + { + "auxiliary_loss_clip": 0.01193792, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.05785728, + "balance_loss_mlp": 1.02098203, + "epoch": 0.24926351229483557, + "flos": 20229271211520.0, + "grad_norm": 1.6511159642335216, + "language_loss": 0.73940581, + "learning_rate": 3.5166811779264837e-06, + "loss": 0.76164967, + "num_input_tokens_seen": 44295445, + "step": 2073, + "time_per_iteration": 2.5805375576019287 + }, + { + "auxiliary_loss_clip": 0.01209578, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.0592593, + "balance_loss_mlp": 1.02302814, + "epoch": 0.24938375518547465, + "flos": 23294570048640.0, + "grad_norm": 1.7831994857452753, + "language_loss": 0.77649891, + "learning_rate": 3.5161732835819545e-06, + "loss": 0.79892445, + "num_input_tokens_seen": 44314755, + "step": 2074, + "time_per_iteration": 2.480890989303589 + }, + { + "auxiliary_loss_clip": 0.01213042, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.06322157, + "balance_loss_mlp": 1.02025867, + "epoch": 0.24950399807611376, + "flos": 17311673099520.0, + "grad_norm": 1.7685037465490576, + "language_loss": 0.83107197, + "learning_rate": 3.515665159235143e-06, + "loss": 0.85349727, + "num_input_tokens_seen": 44333640, + "step": 2075, + "time_per_iteration": 2.4774997234344482 + }, + { + "auxiliary_loss_clip": 0.01172965, + "auxiliary_loss_mlp": 0.01026145, + "balance_loss_clip": 1.05004966, + "balance_loss_mlp": 1.01793146, + "epoch": 0.24962424096675284, + "flos": 19024863252480.0, + "grad_norm": 1.5841661672194396, + "language_loss": 0.74937916, + "learning_rate": 3.5151568049631318e-06, + "loss": 0.77137029, + "num_input_tokens_seen": 44352355, + "step": 2076, + "time_per_iteration": 2.553252696990967 + }, + { + "auxiliary_loss_clip": 0.01209839, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.05929852, + "balance_loss_mlp": 1.02175391, + "epoch": 0.24974448385739192, + "flos": 33398790710400.0, + "grad_norm": 1.7465785249407617, + "language_loss": 0.80122817, + "learning_rate": 3.5146482208430385e-06, + "loss": 0.82364279, + "num_input_tokens_seen": 44374185, + "step": 2077, + "time_per_iteration": 3.498643159866333 + }, + { + "auxiliary_loss_clip": 0.01127088, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.04837871, + "balance_loss_mlp": 1.02331853, + "epoch": 0.24986472674803104, + "flos": 30007279532160.0, + "grad_norm": 1.8325132885915663, + "language_loss": 0.67639148, + "learning_rate": 3.514139406952014e-06, + "loss": 0.69800144, + "num_input_tokens_seen": 44396210, + "step": 2078, + "time_per_iteration": 3.4114110469818115 + }, + { + "auxiliary_loss_clip": 0.01195756, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.06046069, + "balance_loss_mlp": 1.02257836, + "epoch": 0.24998496963867012, + "flos": 26613074833920.0, + "grad_norm": 1.767743304935125, + "language_loss": 0.83352041, + "learning_rate": 3.5136303633672454e-06, + "loss": 0.85579538, + "num_input_tokens_seen": 44416340, + "step": 2079, + "time_per_iteration": 2.549199342727661 + }, + { + "auxiliary_loss_clip": 0.01174866, + "auxiliary_loss_mlp": 0.00764589, + "balance_loss_clip": 1.05822933, + "balance_loss_mlp": 1.00126028, + "epoch": 0.25010521252930923, + "flos": 23553989049600.0, + "grad_norm": 1.763635891382621, + "language_loss": 0.74358666, + "learning_rate": 3.5131210901659544e-06, + "loss": 0.76298118, + "num_input_tokens_seen": 44438095, + "step": 2080, + "time_per_iteration": 2.6323556900024414 + }, + { + "auxiliary_loss_clip": 0.01156598, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.0512861, + "balance_loss_mlp": 1.02250409, + "epoch": 0.2502254554199483, + "flos": 23441193365760.0, + "grad_norm": 2.245331530783253, + "language_loss": 0.81737947, + "learning_rate": 3.5126115874253967e-06, + "loss": 0.83926582, + "num_input_tokens_seen": 44457650, + "step": 2081, + "time_per_iteration": 2.5791807174682617 + }, + { + "auxiliary_loss_clip": 0.01166006, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.05755448, + "balance_loss_mlp": 1.02504325, + "epoch": 0.2503456983105874, + "flos": 28761681651840.0, + "grad_norm": 2.027573687189531, + "language_loss": 0.80711162, + "learning_rate": 3.5121018552228644e-06, + "loss": 0.82912087, + "num_input_tokens_seen": 44476155, + "step": 2082, + "time_per_iteration": 2.6169235706329346 + }, + { + "auxiliary_loss_clip": 0.01166833, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.05506468, + "balance_loss_mlp": 1.02041078, + "epoch": 0.2504659412012265, + "flos": 18770256673920.0, + "grad_norm": 2.029972118376554, + "language_loss": 0.76470101, + "learning_rate": 3.5115918936356827e-06, + "loss": 0.78667009, + "num_input_tokens_seen": 44492910, + "step": 2083, + "time_per_iteration": 2.552333354949951 + }, + { + "auxiliary_loss_clip": 0.01147884, + "auxiliary_loss_mlp": 0.01036163, + "balance_loss_clip": 1.0536449, + "balance_loss_mlp": 1.02694845, + "epoch": 0.25058618409186556, + "flos": 16873383346560.0, + "grad_norm": 2.370308756378618, + "language_loss": 0.78755164, + "learning_rate": 3.5110817027412123e-06, + "loss": 0.80939209, + "num_input_tokens_seen": 44512000, + "step": 2084, + "time_per_iteration": 2.5458931922912598 + }, + { + "auxiliary_loss_clip": 0.01157185, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.05020928, + "balance_loss_mlp": 1.02140415, + "epoch": 0.25070642698250467, + "flos": 24425540651520.0, + "grad_norm": 2.8180417453343094, + "language_loss": 0.68780911, + "learning_rate": 3.5105712826168493e-06, + "loss": 0.70968521, + "num_input_tokens_seen": 44531650, + "step": 2085, + "time_per_iteration": 2.596689224243164 + }, + { + "auxiliary_loss_clip": 0.01192986, + "auxiliary_loss_mlp": 0.00763298, + "balance_loss_clip": 1.05675495, + "balance_loss_mlp": 1.00116539, + "epoch": 0.2508266698731437, + "flos": 20260944028800.0, + "grad_norm": 1.9214801655449527, + "language_loss": 0.7067641, + "learning_rate": 3.5100606333400235e-06, + "loss": 0.72632694, + "num_input_tokens_seen": 44548785, + "step": 2086, + "time_per_iteration": 2.499660015106201 + }, + { + "auxiliary_loss_clip": 0.01191625, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.05773067, + "balance_loss_mlp": 1.02131391, + "epoch": 0.25094691276378284, + "flos": 19245318975360.0, + "grad_norm": 2.9429969861002228, + "language_loss": 0.77205259, + "learning_rate": 3.5095497549882006e-06, + "loss": 0.7942903, + "num_input_tokens_seen": 44567230, + "step": 2087, + "time_per_iteration": 2.5072739124298096 + }, + { + "auxiliary_loss_clip": 0.01200347, + "auxiliary_loss_mlp": 0.01029978, + "balance_loss_clip": 1.06318438, + "balance_loss_mlp": 1.02018487, + "epoch": 0.25106715565442195, + "flos": 26943237671040.0, + "grad_norm": 2.52724869028484, + "language_loss": 0.7207886, + "learning_rate": 3.50903864763888e-06, + "loss": 0.74309188, + "num_input_tokens_seen": 44588020, + "step": 2088, + "time_per_iteration": 2.5900368690490723 + }, + { + "auxiliary_loss_clip": 0.01200584, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.05949688, + "balance_loss_mlp": 1.02236116, + "epoch": 0.251187398545061, + "flos": 48359570572800.0, + "grad_norm": 1.9646511295915956, + "language_loss": 0.75727779, + "learning_rate": 3.5085273113695965e-06, + "loss": 0.77960068, + "num_input_tokens_seen": 44612590, + "step": 2089, + "time_per_iteration": 2.723231077194214 + }, + { + "auxiliary_loss_clip": 0.01210739, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.06029487, + "balance_loss_mlp": 1.02427125, + "epoch": 0.2513076414357001, + "flos": 27016100409600.0, + "grad_norm": 1.789578014643017, + "language_loss": 0.78525037, + "learning_rate": 3.508015746257919e-06, + "loss": 0.80769825, + "num_input_tokens_seen": 44631630, + "step": 2090, + "time_per_iteration": 2.5166873931884766 + }, + { + "auxiliary_loss_clip": 0.0116848, + "auxiliary_loss_mlp": 0.01034886, + "balance_loss_clip": 1.05636525, + "balance_loss_mlp": 1.02509272, + "epoch": 0.2514278843263392, + "flos": 19463619882240.0, + "grad_norm": 2.0204404147046375, + "language_loss": 0.83061755, + "learning_rate": 3.5075039523814518e-06, + "loss": 0.85265118, + "num_input_tokens_seen": 44650820, + "step": 2091, + "time_per_iteration": 2.564723491668701 + }, + { + "auxiliary_loss_clip": 0.01200617, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.05775988, + "balance_loss_mlp": 1.0213418, + "epoch": 0.2515481272169783, + "flos": 16866092885760.0, + "grad_norm": 2.188227869645875, + "language_loss": 0.81560993, + "learning_rate": 3.506991929817834e-06, + "loss": 0.83793342, + "num_input_tokens_seen": 44667540, + "step": 2092, + "time_per_iteration": 2.5234711170196533 + }, + { + "auxiliary_loss_clip": 0.01206147, + "auxiliary_loss_mlp": 0.0102999, + "balance_loss_clip": 1.06064868, + "balance_loss_mlp": 1.02123988, + "epoch": 0.2516683701076174, + "flos": 23732464752000.0, + "grad_norm": 1.777410296667334, + "language_loss": 0.82636571, + "learning_rate": 3.506479678644738e-06, + "loss": 0.84872711, + "num_input_tokens_seen": 44687935, + "step": 2093, + "time_per_iteration": 2.471876621246338 + }, + { + "auxiliary_loss_clip": 0.0114097, + "auxiliary_loss_mlp": 0.01029315, + "balance_loss_clip": 1.05137801, + "balance_loss_mlp": 1.02036881, + "epoch": 0.2517886129982565, + "flos": 27635954434560.0, + "grad_norm": 2.463330777144152, + "language_loss": 0.73889148, + "learning_rate": 3.505967198939873e-06, + "loss": 0.76059437, + "num_input_tokens_seen": 44704975, + "step": 2094, + "time_per_iteration": 2.62829852104187 + }, + { + "auxiliary_loss_clip": 0.01174546, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.05231702, + "balance_loss_mlp": 1.01917052, + "epoch": 0.25190885588889556, + "flos": 38104596529920.0, + "grad_norm": 1.9723883693400794, + "language_loss": 0.77958047, + "learning_rate": 3.5054544907809813e-06, + "loss": 0.80161142, + "num_input_tokens_seen": 44725475, + "step": 2095, + "time_per_iteration": 2.6456923484802246 + }, + { + "auxiliary_loss_clip": 0.01177222, + "auxiliary_loss_mlp": 0.00764623, + "balance_loss_clip": 1.05789363, + "balance_loss_mlp": 1.00136232, + "epoch": 0.25202909877953467, + "flos": 22269894768000.0, + "grad_norm": 2.785930989601767, + "language_loss": 0.80698156, + "learning_rate": 3.50494155424584e-06, + "loss": 0.82640004, + "num_input_tokens_seen": 44744380, + "step": 2096, + "time_per_iteration": 2.55389404296875 + }, + { + "auxiliary_loss_clip": 0.01198467, + "auxiliary_loss_mlp": 0.01032096, + "balance_loss_clip": 1.05908585, + "balance_loss_mlp": 1.02257681, + "epoch": 0.2521493416701738, + "flos": 21761759018880.0, + "grad_norm": 1.623382079299628, + "language_loss": 0.83419585, + "learning_rate": 3.504428389412262e-06, + "loss": 0.85650146, + "num_input_tokens_seen": 44765190, + "step": 2097, + "time_per_iteration": 2.52713942527771 + }, + { + "auxiliary_loss_clip": 0.01190967, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.05650091, + "balance_loss_mlp": 1.02393866, + "epoch": 0.25226958456081283, + "flos": 27746738956800.0, + "grad_norm": 2.2312354328634574, + "language_loss": 0.72665489, + "learning_rate": 3.5039149963580927e-06, + "loss": 0.74889559, + "num_input_tokens_seen": 44785210, + "step": 2098, + "time_per_iteration": 3.3202974796295166 + }, + { + "auxiliary_loss_clip": 0.01174854, + "auxiliary_loss_mlp": 0.01033435, + "balance_loss_clip": 1.05871892, + "balance_loss_mlp": 1.02432728, + "epoch": 0.25238982745145194, + "flos": 30732171903360.0, + "grad_norm": 2.281896727527845, + "language_loss": 0.70450383, + "learning_rate": 3.503401375161215e-06, + "loss": 0.72658676, + "num_input_tokens_seen": 44804955, + "step": 2099, + "time_per_iteration": 2.578850269317627 + }, + { + "auxiliary_loss_clip": 0.0120508, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.05811071, + "balance_loss_mlp": 1.0211755, + "epoch": 0.252510070342091, + "flos": 20266331068800.0, + "grad_norm": 1.7796924308867659, + "language_loss": 0.83597136, + "learning_rate": 3.502887525899544e-06, + "loss": 0.85832304, + "num_input_tokens_seen": 44823935, + "step": 2100, + "time_per_iteration": 2.46354341506958 + }, + { + "auxiliary_loss_clip": 0.01180265, + "auxiliary_loss_mlp": 0.01029197, + "balance_loss_clip": 1.05734277, + "balance_loss_mlp": 1.01944029, + "epoch": 0.2526303132327301, + "flos": 22747399194240.0, + "grad_norm": 1.681258631645701, + "language_loss": 0.82519603, + "learning_rate": 3.50237344865103e-06, + "loss": 0.84729064, + "num_input_tokens_seen": 44844935, + "step": 2101, + "time_per_iteration": 2.5621910095214844 + }, + { + "auxiliary_loss_clip": 0.01210722, + "auxiliary_loss_mlp": 0.0103736, + "balance_loss_clip": 1.06085944, + "balance_loss_mlp": 1.02814507, + "epoch": 0.2527505561233692, + "flos": 30263466309120.0, + "grad_norm": 3.0287617960629722, + "language_loss": 0.76394808, + "learning_rate": 3.501859143493658e-06, + "loss": 0.78642887, + "num_input_tokens_seen": 44865565, + "step": 2102, + "time_per_iteration": 2.5238852500915527 + }, + { + "auxiliary_loss_clip": 0.01127548, + "auxiliary_loss_mlp": 0.01004128, + "balance_loss_clip": 1.04237247, + "balance_loss_mlp": 1.00212514, + "epoch": 0.2528707990140083, + "flos": 58492917164160.0, + "grad_norm": 0.9198774542697352, + "language_loss": 0.60590291, + "learning_rate": 3.5013446105054488e-06, + "loss": 0.62721968, + "num_input_tokens_seen": 44918485, + "step": 2103, + "time_per_iteration": 2.8064167499542236 + }, + { + "auxiliary_loss_clip": 0.01149734, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.053478, + "balance_loss_mlp": 1.02671242, + "epoch": 0.2529910419046474, + "flos": 24645134448000.0, + "grad_norm": 2.689947959380527, + "language_loss": 0.74874812, + "learning_rate": 3.5008298497644555e-06, + "loss": 0.77060741, + "num_input_tokens_seen": 44937530, + "step": 2104, + "time_per_iteration": 3.510887384414673 + }, + { + "auxiliary_loss_clip": 0.01168663, + "auxiliary_loss_mlp": 0.01034801, + "balance_loss_clip": 1.05809307, + "balance_loss_mlp": 1.02478135, + "epoch": 0.2531112847952865, + "flos": 23842135952640.0, + "grad_norm": 1.620817927311642, + "language_loss": 0.88023871, + "learning_rate": 3.500314861348767e-06, + "loss": 0.90227336, + "num_input_tokens_seen": 44958165, + "step": 2105, + "time_per_iteration": 3.3672237396240234 + }, + { + "auxiliary_loss_clip": 0.01157978, + "auxiliary_loss_mlp": 0.01036351, + "balance_loss_clip": 1.05668521, + "balance_loss_mlp": 1.02749956, + "epoch": 0.25323152768592555, + "flos": 16143822207360.0, + "grad_norm": 1.7870348900076634, + "language_loss": 0.76794493, + "learning_rate": 3.499799645336507e-06, + "loss": 0.78988814, + "num_input_tokens_seen": 44975060, + "step": 2106, + "time_per_iteration": 2.5176327228546143 + }, + { + "auxiliary_loss_clip": 0.01198163, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.06277585, + "balance_loss_mlp": 1.02108669, + "epoch": 0.25335177057656466, + "flos": 28405161210240.0, + "grad_norm": 1.560940088177471, + "language_loss": 0.87272751, + "learning_rate": 3.4992842018058336e-06, + "loss": 0.89500576, + "num_input_tokens_seen": 44997960, + "step": 2107, + "time_per_iteration": 2.5629451274871826 + }, + { + "auxiliary_loss_clip": 0.01170272, + "auxiliary_loss_mlp": 0.01029399, + "balance_loss_clip": 1.05606508, + "balance_loss_mlp": 1.02040446, + "epoch": 0.25347201346720377, + "flos": 18799666934400.0, + "grad_norm": 2.247012906264391, + "language_loss": 0.88612056, + "learning_rate": 3.4987685308349384e-06, + "loss": 0.90811729, + "num_input_tokens_seen": 45015690, + "step": 2108, + "time_per_iteration": 2.551663875579834 + }, + { + "auxiliary_loss_clip": 0.01162662, + "auxiliary_loss_mlp": 0.01036231, + "balance_loss_clip": 1.05203533, + "balance_loss_mlp": 1.02670062, + "epoch": 0.2535922563578428, + "flos": 15815490963840.0, + "grad_norm": 2.1902859430760744, + "language_loss": 0.61364943, + "learning_rate": 3.4982526325020497e-06, + "loss": 0.63563836, + "num_input_tokens_seen": 45032660, + "step": 2109, + "time_per_iteration": 2.5311169624328613 + }, + { + "auxiliary_loss_clip": 0.01184227, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.06005657, + "balance_loss_mlp": 1.02185011, + "epoch": 0.25371249924848194, + "flos": 16318922031360.0, + "grad_norm": 2.5398507535148274, + "language_loss": 0.8234089, + "learning_rate": 3.4977365068854273e-06, + "loss": 0.84556818, + "num_input_tokens_seen": 45048280, + "step": 2110, + "time_per_iteration": 2.4695026874542236 + }, + { + "auxiliary_loss_clip": 0.01174229, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.05703449, + "balance_loss_mlp": 1.02419376, + "epoch": 0.25383274213912105, + "flos": 21761615364480.0, + "grad_norm": 1.7644066351984797, + "language_loss": 0.73649764, + "learning_rate": 3.4972201540633676e-06, + "loss": 0.75857961, + "num_input_tokens_seen": 45067635, + "step": 2111, + "time_per_iteration": 2.526698350906372 + }, + { + "auxiliary_loss_clip": 0.01169871, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.05546629, + "balance_loss_mlp": 1.02351904, + "epoch": 0.2539529850297601, + "flos": 21396870708480.0, + "grad_norm": 1.7543738315816078, + "language_loss": 0.85379744, + "learning_rate": 3.4967035741142008e-06, + "loss": 0.87583244, + "num_input_tokens_seen": 45086455, + "step": 2112, + "time_per_iteration": 2.527505874633789 + }, + { + "auxiliary_loss_clip": 0.01171342, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.06256962, + "balance_loss_mlp": 1.02473426, + "epoch": 0.2540732279203992, + "flos": 25228467319680.0, + "grad_norm": 1.823329256966325, + "language_loss": 0.81955135, + "learning_rate": 3.4961867671162917e-06, + "loss": 0.84160179, + "num_input_tokens_seen": 45106385, + "step": 2113, + "time_per_iteration": 2.5422539710998535 + }, + { + "auxiliary_loss_clip": 0.0121323, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.06257617, + "balance_loss_mlp": 1.02240777, + "epoch": 0.2541934708110383, + "flos": 19427386037760.0, + "grad_norm": 2.452121420849565, + "language_loss": 0.77036297, + "learning_rate": 3.4956697331480402e-06, + "loss": 0.79281831, + "num_input_tokens_seen": 45124955, + "step": 2114, + "time_per_iteration": 2.428863525390625 + }, + { + "auxiliary_loss_clip": 0.01171696, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.05643868, + "balance_loss_mlp": 1.02305675, + "epoch": 0.2543137137016774, + "flos": 23949436855680.0, + "grad_norm": 1.7019061444726813, + "language_loss": 0.79976726, + "learning_rate": 3.495152472287879e-06, + "loss": 0.82181227, + "num_input_tokens_seen": 45145665, + "step": 2115, + "time_per_iteration": 2.6080031394958496 + }, + { + "auxiliary_loss_clip": 0.01165409, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.05810928, + "balance_loss_mlp": 1.02141404, + "epoch": 0.2544339565923165, + "flos": 25593283802880.0, + "grad_norm": 1.8134489290409792, + "language_loss": 0.73761773, + "learning_rate": 3.4946349846142766e-06, + "loss": 0.75957233, + "num_input_tokens_seen": 45164805, + "step": 2116, + "time_per_iteration": 2.5908701419830322 + }, + { + "auxiliary_loss_clip": 0.01210647, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.0630033, + "balance_loss_mlp": 1.02992618, + "epoch": 0.25455419948295555, + "flos": 21689470897920.0, + "grad_norm": 2.714984083260144, + "language_loss": 0.7545352, + "learning_rate": 3.4941172702057353e-06, + "loss": 0.77703261, + "num_input_tokens_seen": 45184865, + "step": 2117, + "time_per_iteration": 2.5367674827575684 + }, + { + "auxiliary_loss_clip": 0.01179643, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.06030905, + "balance_loss_mlp": 1.02409899, + "epoch": 0.25467444237359466, + "flos": 26250341339520.0, + "grad_norm": 1.8224707180842552, + "language_loss": 0.80697495, + "learning_rate": 3.4935993291407924e-06, + "loss": 0.82910645, + "num_input_tokens_seen": 45203690, + "step": 2118, + "time_per_iteration": 2.5747146606445312 + }, + { + "auxiliary_loss_clip": 0.01177731, + "auxiliary_loss_mlp": 0.01033375, + "balance_loss_clip": 1.05787659, + "balance_loss_mlp": 1.02348089, + "epoch": 0.25479468526423377, + "flos": 26979686997120.0, + "grad_norm": 2.3822660401443474, + "language_loss": 0.70912182, + "learning_rate": 3.4930811614980183e-06, + "loss": 0.73123294, + "num_input_tokens_seen": 45225385, + "step": 2119, + "time_per_iteration": 2.5754342079162598 + }, + { + "auxiliary_loss_clip": 0.01188256, + "auxiliary_loss_mlp": 0.01035929, + "balance_loss_clip": 1.05858421, + "balance_loss_mlp": 1.02682686, + "epoch": 0.2549149281548728, + "flos": 23475811098240.0, + "grad_norm": 1.8970964193664563, + "language_loss": 0.79338145, + "learning_rate": 3.4925627673560198e-06, + "loss": 0.81562328, + "num_input_tokens_seen": 45246045, + "step": 2120, + "time_per_iteration": 2.522830009460449 + }, + { + "auxiliary_loss_clip": 0.01165668, + "auxiliary_loss_mlp": 0.01038438, + "balance_loss_clip": 1.05677044, + "balance_loss_mlp": 1.02977741, + "epoch": 0.25503517104551193, + "flos": 25812302981760.0, + "grad_norm": 2.661072957673536, + "language_loss": 0.88428557, + "learning_rate": 3.4920441467934357e-06, + "loss": 0.90632659, + "num_input_tokens_seen": 45266560, + "step": 2121, + "time_per_iteration": 2.587238311767578 + }, + { + "auxiliary_loss_clip": 0.01157619, + "auxiliary_loss_mlp": 0.01036816, + "balance_loss_clip": 1.05562901, + "balance_loss_mlp": 1.02777994, + "epoch": 0.25515541393615104, + "flos": 26645106787200.0, + "grad_norm": 1.966157032080573, + "language_loss": 0.82839191, + "learning_rate": 3.491525299888941e-06, + "loss": 0.85033631, + "num_input_tokens_seen": 45285405, + "step": 2122, + "time_per_iteration": 2.6034395694732666 + }, + { + "auxiliary_loss_clip": 0.01094853, + "auxiliary_loss_mlp": 0.00754878, + "balance_loss_clip": 1.04400086, + "balance_loss_mlp": 1.00095141, + "epoch": 0.2552756568267901, + "flos": 65955945847680.0, + "grad_norm": 0.8849975689910463, + "language_loss": 0.62691009, + "learning_rate": 3.491006226721244e-06, + "loss": 0.64540744, + "num_input_tokens_seen": 45349615, + "step": 2123, + "time_per_iteration": 3.1456305980682373 + }, + { + "auxiliary_loss_clip": 0.01186754, + "auxiliary_loss_mlp": 0.007643, + "balance_loss_clip": 1.06344199, + "balance_loss_mlp": 1.0012691, + "epoch": 0.2553958997174292, + "flos": 17931096161280.0, + "grad_norm": 1.8908626252248844, + "language_loss": 0.77408051, + "learning_rate": 3.4904869273690882e-06, + "loss": 0.79359102, + "num_input_tokens_seen": 45367505, + "step": 2124, + "time_per_iteration": 2.534048318862915 + }, + { + "auxiliary_loss_clip": 0.01198786, + "auxiliary_loss_mlp": 0.01026167, + "balance_loss_clip": 1.06181324, + "balance_loss_mlp": 1.01729155, + "epoch": 0.2555161426080683, + "flos": 23367791923200.0, + "grad_norm": 2.7644258258860352, + "language_loss": 0.88696647, + "learning_rate": 3.489967401911251e-06, + "loss": 0.90921593, + "num_input_tokens_seen": 45386805, + "step": 2125, + "time_per_iteration": 3.295128583908081 + }, + { + "auxiliary_loss_clip": 0.0121717, + "auxiliary_loss_mlp": 0.01033972, + "balance_loss_clip": 1.06608415, + "balance_loss_mlp": 1.02309442, + "epoch": 0.2556363854987074, + "flos": 40625130723840.0, + "grad_norm": 1.8508022066010383, + "language_loss": 0.6953373, + "learning_rate": 3.4894476504265428e-06, + "loss": 0.71784872, + "num_input_tokens_seen": 45411045, + "step": 2126, + "time_per_iteration": 2.6433393955230713 + }, + { + "auxiliary_loss_clip": 0.01115047, + "auxiliary_loss_mlp": 0.01004231, + "balance_loss_clip": 1.04365349, + "balance_loss_mlp": 1.00169134, + "epoch": 0.2557566283893465, + "flos": 68019443389440.0, + "grad_norm": 0.7405179474258614, + "language_loss": 0.5443002, + "learning_rate": 3.4889276729938104e-06, + "loss": 0.56549299, + "num_input_tokens_seen": 45469575, + "step": 2127, + "time_per_iteration": 2.9664998054504395 + }, + { + "auxiliary_loss_clip": 0.01174716, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.05700588, + "balance_loss_mlp": 1.01780152, + "epoch": 0.2558768712799856, + "flos": 22635645004800.0, + "grad_norm": 1.9136250464817475, + "language_loss": 0.80596977, + "learning_rate": 3.488407469691934e-06, + "loss": 0.82799447, + "num_input_tokens_seen": 45490270, + "step": 2128, + "time_per_iteration": 2.522911787033081 + }, + { + "auxiliary_loss_clip": 0.01179273, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.05745125, + "balance_loss_mlp": 1.02284241, + "epoch": 0.25599711417062465, + "flos": 26396354125440.0, + "grad_norm": 2.349254643827396, + "language_loss": 0.80623996, + "learning_rate": 3.487887040599828e-06, + "loss": 0.82835782, + "num_input_tokens_seen": 45510070, + "step": 2129, + "time_per_iteration": 2.5704989433288574 + }, + { + "auxiliary_loss_clip": 0.01213838, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.06481028, + "balance_loss_mlp": 1.02570391, + "epoch": 0.25611735706126376, + "flos": 22852042490880.0, + "grad_norm": 2.2105955077988324, + "language_loss": 0.7597664, + "learning_rate": 3.4873663857964407e-06, + "loss": 0.78226113, + "num_input_tokens_seen": 45527285, + "step": 2130, + "time_per_iteration": 3.4133925437927246 + }, + { + "auxiliary_loss_clip": 0.011489, + "auxiliary_loss_mlp": 0.01035092, + "balance_loss_clip": 1.05491734, + "balance_loss_mlp": 1.02495933, + "epoch": 0.2562375999519028, + "flos": 23367863750400.0, + "grad_norm": 1.7716620352230272, + "language_loss": 0.66543591, + "learning_rate": 3.4868455053607556e-06, + "loss": 0.68727589, + "num_input_tokens_seen": 45546900, + "step": 2131, + "time_per_iteration": 3.3764991760253906 + }, + { + "auxiliary_loss_clip": 0.01199258, + "auxiliary_loss_mlp": 0.0103568, + "balance_loss_clip": 1.05922198, + "balance_loss_mlp": 1.02603614, + "epoch": 0.2563578428425419, + "flos": 22856962654080.0, + "grad_norm": 3.2073863762838473, + "language_loss": 0.72177035, + "learning_rate": 3.486324399371789e-06, + "loss": 0.74411976, + "num_input_tokens_seen": 45566200, + "step": 2132, + "time_per_iteration": 2.498868942260742 + }, + { + "auxiliary_loss_clip": 0.01161479, + "auxiliary_loss_mlp": 0.01038006, + "balance_loss_clip": 1.05672216, + "balance_loss_mlp": 1.02896965, + "epoch": 0.25647808573318104, + "flos": 21653883498240.0, + "grad_norm": 1.9541045187352488, + "language_loss": 0.78494954, + "learning_rate": 3.485803067908593e-06, + "loss": 0.80694437, + "num_input_tokens_seen": 45585710, + "step": 2133, + "time_per_iteration": 2.546391487121582 + }, + { + "auxiliary_loss_clip": 0.01110771, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.04474068, + "balance_loss_mlp": 1.02492917, + "epoch": 0.2565983286238201, + "flos": 33730569659520.0, + "grad_norm": 1.8062645234911745, + "language_loss": 0.79570651, + "learning_rate": 3.485281511050253e-06, + "loss": 0.81716096, + "num_input_tokens_seen": 45607845, + "step": 2134, + "time_per_iteration": 2.7235312461853027 + }, + { + "auxiliary_loss_clip": 0.01199729, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.06069183, + "balance_loss_mlp": 1.02504539, + "epoch": 0.2567185715144592, + "flos": 16216002587520.0, + "grad_norm": 3.6119305721229447, + "language_loss": 0.89920598, + "learning_rate": 3.484759728875889e-06, + "loss": 0.92155039, + "num_input_tokens_seen": 45623210, + "step": 2135, + "time_per_iteration": 2.4774370193481445 + }, + { + "auxiliary_loss_clip": 0.01136779, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.05229926, + "balance_loss_mlp": 1.0283016, + "epoch": 0.2568388144050983, + "flos": 17458475984640.0, + "grad_norm": 1.7588320643640283, + "language_loss": 0.80961126, + "learning_rate": 3.4842377214646543e-06, + "loss": 0.83135414, + "num_input_tokens_seen": 45641505, + "step": 2136, + "time_per_iteration": 2.571418046951294 + }, + { + "auxiliary_loss_clip": 0.01209165, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.0621078, + "balance_loss_mlp": 1.02619827, + "epoch": 0.25695905729573737, + "flos": 20887442069760.0, + "grad_norm": 1.7296693757860617, + "language_loss": 0.66624212, + "learning_rate": 3.483715488895737e-06, + "loss": 0.68868637, + "num_input_tokens_seen": 45661835, + "step": 2137, + "time_per_iteration": 2.4665029048919678 + }, + { + "auxiliary_loss_clip": 0.01147459, + "auxiliary_loss_mlp": 0.01029968, + "balance_loss_clip": 1.05019176, + "balance_loss_mlp": 1.02044892, + "epoch": 0.2570793001863765, + "flos": 24717278914560.0, + "grad_norm": 2.1795694889333523, + "language_loss": 0.78334129, + "learning_rate": 3.48319303124836e-06, + "loss": 0.80511558, + "num_input_tokens_seen": 45682215, + "step": 2138, + "time_per_iteration": 2.625080108642578 + }, + { + "auxiliary_loss_clip": 0.0117557, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.05925131, + "balance_loss_mlp": 1.0218612, + "epoch": 0.2571995430770156, + "flos": 26906896085760.0, + "grad_norm": 2.085022932892146, + "language_loss": 0.66648912, + "learning_rate": 3.4826703486017798e-06, + "loss": 0.68855721, + "num_input_tokens_seen": 45701840, + "step": 2139, + "time_per_iteration": 2.5612897872924805 + }, + { + "auxiliary_loss_clip": 0.01192519, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.06067789, + "balance_loss_mlp": 1.02156854, + "epoch": 0.25731978596765465, + "flos": 19792561656960.0, + "grad_norm": 1.6318146761712238, + "language_loss": 0.76885098, + "learning_rate": 3.4821474410352867e-06, + "loss": 0.79108107, + "num_input_tokens_seen": 45720500, + "step": 2140, + "time_per_iteration": 2.556918144226074 + }, + { + "auxiliary_loss_clip": 0.01084958, + "auxiliary_loss_mlp": 0.01005569, + "balance_loss_clip": 1.03921795, + "balance_loss_mlp": 1.00239778, + "epoch": 0.25744002885829376, + "flos": 70564970471040.0, + "grad_norm": 0.8999623557472874, + "language_loss": 0.62621933, + "learning_rate": 3.481624308628205e-06, + "loss": 0.64712459, + "num_input_tokens_seen": 45781870, + "step": 2141, + "time_per_iteration": 3.2933452129364014 + }, + { + "auxiliary_loss_clip": 0.01177322, + "auxiliary_loss_mlp": 0.01033661, + "balance_loss_clip": 1.05670762, + "balance_loss_mlp": 1.02414823, + "epoch": 0.25756027174893287, + "flos": 18038181582720.0, + "grad_norm": 3.212182069857203, + "language_loss": 1.00315654, + "learning_rate": 3.481100951459893e-06, + "loss": 1.02526641, + "num_input_tokens_seen": 45794890, + "step": 2142, + "time_per_iteration": 2.534937620162964 + }, + { + "auxiliary_loss_clip": 0.01191474, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.058568, + "balance_loss_mlp": 1.0217663, + "epoch": 0.2576805146395719, + "flos": 22674069578880.0, + "grad_norm": 1.5899786862721246, + "language_loss": 0.78834361, + "learning_rate": 3.4805773696097453e-06, + "loss": 0.81057012, + "num_input_tokens_seen": 45815780, + "step": 2143, + "time_per_iteration": 2.525907039642334 + }, + { + "auxiliary_loss_clip": 0.01174866, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.05991483, + "balance_loss_mlp": 1.01961243, + "epoch": 0.25780075753021103, + "flos": 16472225278080.0, + "grad_norm": 1.9527030131637773, + "language_loss": 0.87907928, + "learning_rate": 3.4800535631571874e-06, + "loss": 0.90112025, + "num_input_tokens_seen": 45831310, + "step": 2144, + "time_per_iteration": 2.5082976818084717 + }, + { + "auxiliary_loss_clip": 0.01183989, + "auxiliary_loss_mlp": 0.01037565, + "balance_loss_clip": 1.05801511, + "balance_loss_mlp": 1.0278194, + "epoch": 0.25792100042085014, + "flos": 22820297846400.0, + "grad_norm": 2.493707492825981, + "language_loss": 0.76478982, + "learning_rate": 3.4795295321816804e-06, + "loss": 0.78700531, + "num_input_tokens_seen": 45850135, + "step": 2145, + "time_per_iteration": 2.529543876647949 + }, + { + "auxiliary_loss_clip": 0.01168256, + "auxiliary_loss_mlp": 0.01037876, + "balance_loss_clip": 1.05690563, + "balance_loss_mlp": 1.02841663, + "epoch": 0.2580412433114892, + "flos": 18697286194560.0, + "grad_norm": 3.393185209914697, + "language_loss": 0.91164839, + "learning_rate": 3.47900527676272e-06, + "loss": 0.93370974, + "num_input_tokens_seen": 45868470, + "step": 2146, + "time_per_iteration": 2.5117640495300293 + }, + { + "auxiliary_loss_clip": 0.01211428, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.06449318, + "balance_loss_mlp": 1.02572691, + "epoch": 0.2581614862021283, + "flos": 14283146810880.0, + "grad_norm": 1.9667345444962852, + "language_loss": 0.8832534, + "learning_rate": 3.478480796979835e-06, + "loss": 0.90571964, + "num_input_tokens_seen": 45886355, + "step": 2147, + "time_per_iteration": 2.446657657623291 + }, + { + "auxiliary_loss_clip": 0.01174209, + "auxiliary_loss_mlp": 0.01028127, + "balance_loss_clip": 1.05717802, + "balance_loss_mlp": 1.01895952, + "epoch": 0.25828172909276736, + "flos": 29498281856640.0, + "grad_norm": 1.4975642819999468, + "language_loss": 0.7757293, + "learning_rate": 3.4779560929125894e-06, + "loss": 0.79775262, + "num_input_tokens_seen": 45907900, + "step": 2148, + "time_per_iteration": 2.6038734912872314 + }, + { + "auxiliary_loss_clip": 0.01089575, + "auxiliary_loss_mlp": 0.01005444, + "balance_loss_clip": 1.04155564, + "balance_loss_mlp": 1.00286889, + "epoch": 0.2584019719834065, + "flos": 67114387376640.0, + "grad_norm": 0.6799228745901635, + "language_loss": 0.56979376, + "learning_rate": 3.4774311646405783e-06, + "loss": 0.5907439, + "num_input_tokens_seen": 45977805, + "step": 2149, + "time_per_iteration": 3.242149591445923 + }, + { + "auxiliary_loss_clip": 0.0115538, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.05454135, + "balance_loss_mlp": 1.02326965, + "epoch": 0.2585222148740456, + "flos": 22893555634560.0, + "grad_norm": 2.2430496661563573, + "language_loss": 0.83480787, + "learning_rate": 3.476906012243435e-06, + "loss": 0.85668153, + "num_input_tokens_seen": 45996715, + "step": 2150, + "time_per_iteration": 2.583371877670288 + }, + { + "auxiliary_loss_clip": 0.01184199, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.06057811, + "balance_loss_mlp": 1.01930368, + "epoch": 0.25864245776468464, + "flos": 28909202808960.0, + "grad_norm": 1.6246665045476154, + "language_loss": 0.81386536, + "learning_rate": 3.476380635800824e-06, + "loss": 0.83599567, + "num_input_tokens_seen": 46017915, + "step": 2151, + "time_per_iteration": 2.6141316890716553 + }, + { + "auxiliary_loss_clip": 0.011777, + "auxiliary_loss_mlp": 0.01031166, + "balance_loss_clip": 1.05898774, + "balance_loss_mlp": 1.02232063, + "epoch": 0.25876270065532375, + "flos": 14793185980800.0, + "grad_norm": 2.0830984784151703, + "language_loss": 0.86036265, + "learning_rate": 3.475855035392444e-06, + "loss": 0.88245124, + "num_input_tokens_seen": 46033235, + "step": 2152, + "time_per_iteration": 3.8390533924102783 + }, + { + "auxiliary_loss_clip": 0.01131826, + "auxiliary_loss_mlp": 0.01029817, + "balance_loss_clip": 1.05437434, + "balance_loss_mlp": 1.02089381, + "epoch": 0.25888294354596286, + "flos": 60467821810560.0, + "grad_norm": 2.08306049660786, + "language_loss": 0.71569443, + "learning_rate": 3.475329211098029e-06, + "loss": 0.73731083, + "num_input_tokens_seen": 46056390, + "step": 2153, + "time_per_iteration": 2.9688358306884766 + }, + { + "auxiliary_loss_clip": 0.01152508, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.05670452, + "balance_loss_mlp": 1.01941359, + "epoch": 0.2590031864366019, + "flos": 27851166771840.0, + "grad_norm": 1.5582507213354646, + "language_loss": 0.82287574, + "learning_rate": 3.4748031629973453e-06, + "loss": 0.84468639, + "num_input_tokens_seen": 46077120, + "step": 2154, + "time_per_iteration": 2.6567227840423584 + }, + { + "auxiliary_loss_clip": 0.01073611, + "auxiliary_loss_mlp": 0.01003855, + "balance_loss_clip": 1.04009473, + "balance_loss_mlp": 1.00209069, + "epoch": 0.25912342932724103, + "flos": 62422444206720.0, + "grad_norm": 0.9153383801395144, + "language_loss": 0.56557369, + "learning_rate": 3.4742768911701944e-06, + "loss": 0.58634841, + "num_input_tokens_seen": 46139815, + "step": 2155, + "time_per_iteration": 3.23642897605896 + }, + { + "auxiliary_loss_clip": 0.0120126, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.06386042, + "balance_loss_mlp": 1.0291332, + "epoch": 0.25924367221788014, + "flos": 12378839368320.0, + "grad_norm": 3.071590148759398, + "language_loss": 0.69918042, + "learning_rate": 3.4737503956964113e-06, + "loss": 0.72159344, + "num_input_tokens_seen": 46152120, + "step": 2156, + "time_per_iteration": 4.235851764678955 + }, + { + "auxiliary_loss_clip": 0.01172114, + "auxiliary_loss_mlp": 0.01038798, + "balance_loss_clip": 1.05689108, + "balance_loss_mlp": 1.02848077, + "epoch": 0.2593639151085192, + "flos": 14575208296320.0, + "grad_norm": 3.177376680075476, + "language_loss": 0.67230749, + "learning_rate": 3.473223676655865e-06, + "loss": 0.69441658, + "num_input_tokens_seen": 46170120, + "step": 2157, + "time_per_iteration": 3.321575164794922 + }, + { + "auxiliary_loss_clip": 0.01171035, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.05484676, + "balance_loss_mlp": 1.02372575, + "epoch": 0.2594841579991583, + "flos": 15230937029760.0, + "grad_norm": 1.986048264940477, + "language_loss": 0.79678667, + "learning_rate": 3.472696734128459e-06, + "loss": 0.8188386, + "num_input_tokens_seen": 46187985, + "step": 2158, + "time_per_iteration": 2.5242321491241455 + }, + { + "auxiliary_loss_clip": 0.0119552, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.06124496, + "balance_loss_mlp": 1.02183914, + "epoch": 0.2596044008897974, + "flos": 23623583650560.0, + "grad_norm": 2.572027662561668, + "language_loss": 0.75841039, + "learning_rate": 3.4721695681941286e-06, + "loss": 0.78067923, + "num_input_tokens_seen": 46207025, + "step": 2159, + "time_per_iteration": 2.5904955863952637 + }, + { + "auxiliary_loss_clip": 0.01175774, + "auxiliary_loss_mlp": 0.00764758, + "balance_loss_clip": 1.05667567, + "balance_loss_mlp": 1.00088191, + "epoch": 0.25972464378043647, + "flos": 13772281628160.0, + "grad_norm": 3.556448341155961, + "language_loss": 0.82570046, + "learning_rate": 3.471642178932845e-06, + "loss": 0.84510577, + "num_input_tokens_seen": 46225670, + "step": 2160, + "time_per_iteration": 2.5687167644500732 + }, + { + "auxiliary_loss_clip": 0.01180046, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.05669641, + "balance_loss_mlp": 1.02163255, + "epoch": 0.2598448866710756, + "flos": 19573578391680.0, + "grad_norm": 2.3503553334861516, + "language_loss": 0.89278805, + "learning_rate": 3.471114566424613e-06, + "loss": 0.91489816, + "num_input_tokens_seen": 46244130, + "step": 2161, + "time_per_iteration": 2.524306297302246 + }, + { + "auxiliary_loss_clip": 0.01178501, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.06000483, + "balance_loss_mlp": 1.02081084, + "epoch": 0.25996512956171464, + "flos": 21653237053440.0, + "grad_norm": 1.868592188369467, + "language_loss": 0.76046926, + "learning_rate": 3.4705867307494715e-06, + "loss": 0.78256261, + "num_input_tokens_seen": 46263200, + "step": 2162, + "time_per_iteration": 2.550487756729126 + }, + { + "auxiliary_loss_clip": 0.01197357, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.06076562, + "balance_loss_mlp": 1.02163744, + "epoch": 0.26008537245235375, + "flos": 18223480869120.0, + "grad_norm": 2.1920378979209243, + "language_loss": 0.84468186, + "learning_rate": 3.470058671987492e-06, + "loss": 0.86696231, + "num_input_tokens_seen": 46281465, + "step": 2163, + "time_per_iteration": 2.4827804565429688 + }, + { + "auxiliary_loss_clip": 0.01198144, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.05964136, + "balance_loss_mlp": 1.02676725, + "epoch": 0.26020561534299286, + "flos": 24645385843200.0, + "grad_norm": 1.9838805671554751, + "language_loss": 0.84219944, + "learning_rate": 3.4695303902187805e-06, + "loss": 0.86454946, + "num_input_tokens_seen": 46301020, + "step": 2164, + "time_per_iteration": 2.576317310333252 + }, + { + "auxiliary_loss_clip": 0.01159739, + "auxiliary_loss_mlp": 0.01037567, + "balance_loss_clip": 1.05413914, + "balance_loss_mlp": 1.02752924, + "epoch": 0.2603258582336319, + "flos": 25773662926080.0, + "grad_norm": 2.0365316758181566, + "language_loss": 0.78617871, + "learning_rate": 3.469001885523478e-06, + "loss": 0.80815184, + "num_input_tokens_seen": 46321740, + "step": 2165, + "time_per_iteration": 2.676100254058838 + }, + { + "auxiliary_loss_clip": 0.01206667, + "auxiliary_loss_mlp": 0.01041361, + "balance_loss_clip": 1.06028962, + "balance_loss_mlp": 1.03142488, + "epoch": 0.260446101124271, + "flos": 28766314506240.0, + "grad_norm": 1.6983852959484727, + "language_loss": 0.81188893, + "learning_rate": 3.4684731579817568e-06, + "loss": 0.83436918, + "num_input_tokens_seen": 46342730, + "step": 2166, + "time_per_iteration": 2.53603458404541 + }, + { + "auxiliary_loss_clip": 0.01132057, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.0555687, + "balance_loss_mlp": 1.02784848, + "epoch": 0.26056634401491013, + "flos": 25666757072640.0, + "grad_norm": 1.5626522140039851, + "language_loss": 0.76280355, + "learning_rate": 3.4679442076738247e-06, + "loss": 0.78449392, + "num_input_tokens_seen": 46362445, + "step": 2167, + "time_per_iteration": 2.657926082611084 + }, + { + "auxiliary_loss_clip": 0.01212013, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.06358683, + "balance_loss_mlp": 1.02368653, + "epoch": 0.2606865869055492, + "flos": 27052765217280.0, + "grad_norm": 2.0148815023432887, + "language_loss": 0.83762741, + "learning_rate": 3.4674150346799245e-06, + "loss": 0.86008584, + "num_input_tokens_seen": 46382145, + "step": 2168, + "time_per_iteration": 2.4923644065856934 + }, + { + "auxiliary_loss_clip": 0.01177443, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.05903304, + "balance_loss_mlp": 1.0215435, + "epoch": 0.2608068297961883, + "flos": 17712615686400.0, + "grad_norm": 2.297595910295783, + "language_loss": 0.79767686, + "learning_rate": 3.4668856390803295e-06, + "loss": 0.81975877, + "num_input_tokens_seen": 46400025, + "step": 2169, + "time_per_iteration": 2.5130422115325928 + }, + { + "auxiliary_loss_clip": 0.01183175, + "auxiliary_loss_mlp": 0.01030862, + "balance_loss_clip": 1.060076, + "balance_loss_mlp": 1.02166498, + "epoch": 0.2609270726868274, + "flos": 18551632544640.0, + "grad_norm": 1.8567906303704276, + "language_loss": 0.89915407, + "learning_rate": 3.4663560209553495e-06, + "loss": 0.92129445, + "num_input_tokens_seen": 46418090, + "step": 2170, + "time_per_iteration": 2.4837265014648438 + }, + { + "auxiliary_loss_clip": 0.01168994, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.05658078, + "balance_loss_mlp": 1.023579, + "epoch": 0.26104731557746647, + "flos": 21835699165440.0, + "grad_norm": 1.6486232169764274, + "language_loss": 0.79310489, + "learning_rate": 3.4658261803853267e-06, + "loss": 0.81512219, + "num_input_tokens_seen": 46436015, + "step": 2171, + "time_per_iteration": 2.5603272914886475 + }, + { + "auxiliary_loss_clip": 0.01175017, + "auxiliary_loss_mlp": 0.01033053, + "balance_loss_clip": 1.05923963, + "balance_loss_mlp": 1.02346206, + "epoch": 0.2611675584681056, + "flos": 21689650465920.0, + "grad_norm": 7.403395318207266, + "language_loss": 0.80895674, + "learning_rate": 3.4652961174506383e-06, + "loss": 0.83103752, + "num_input_tokens_seen": 46455885, + "step": 2172, + "time_per_iteration": 2.526472330093384 + }, + { + "auxiliary_loss_clip": 0.01120973, + "auxiliary_loss_mlp": 0.01001811, + "balance_loss_clip": 1.05513811, + "balance_loss_mlp": 1.00016582, + "epoch": 0.2612878013587447, + "flos": 71862101389440.0, + "grad_norm": 0.9712397895893216, + "language_loss": 0.58154279, + "learning_rate": 3.464765832231694e-06, + "loss": 0.60277063, + "num_input_tokens_seen": 46510050, + "step": 2173, + "time_per_iteration": 3.0684995651245117 + }, + { + "auxiliary_loss_clip": 0.01196592, + "auxiliary_loss_mlp": 0.01028922, + "balance_loss_clip": 1.06342673, + "balance_loss_mlp": 1.01979637, + "epoch": 0.26140804424938374, + "flos": 20227511445120.0, + "grad_norm": 1.870311859634478, + "language_loss": 0.7064085, + "learning_rate": 3.4642353248089373e-06, + "loss": 0.72866356, + "num_input_tokens_seen": 46528810, + "step": 2174, + "time_per_iteration": 2.4820303916931152 + }, + { + "auxiliary_loss_clip": 0.01173217, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.05699348, + "balance_loss_mlp": 1.01954424, + "epoch": 0.26152828714002285, + "flos": 25557085872000.0, + "grad_norm": 1.629799740219285, + "language_loss": 0.80035865, + "learning_rate": 3.463704595262846e-06, + "loss": 0.82238376, + "num_input_tokens_seen": 46549690, + "step": 2175, + "time_per_iteration": 2.5791454315185547 + }, + { + "auxiliary_loss_clip": 0.01160216, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.05637741, + "balance_loss_mlp": 1.02717173, + "epoch": 0.26164853003066196, + "flos": 25446516831360.0, + "grad_norm": 2.072702741122209, + "language_loss": 0.70460141, + "learning_rate": 3.463173643673931e-06, + "loss": 0.72656536, + "num_input_tokens_seen": 46572215, + "step": 2176, + "time_per_iteration": 2.6131815910339355 + }, + { + "auxiliary_loss_clip": 0.01128957, + "auxiliary_loss_mlp": 0.0100988, + "balance_loss_clip": 1.05613852, + "balance_loss_mlp": 1.00804412, + "epoch": 0.261768772921301, + "flos": 53944580568960.0, + "grad_norm": 0.9016404712133905, + "language_loss": 0.63482016, + "learning_rate": 3.4626424701227387e-06, + "loss": 0.65620857, + "num_input_tokens_seen": 46627275, + "step": 2177, + "time_per_iteration": 3.012408971786499 + }, + { + "auxiliary_loss_clip": 0.01138466, + "auxiliary_loss_mlp": 0.0100753, + "balance_loss_clip": 1.05523539, + "balance_loss_mlp": 1.00567079, + "epoch": 0.26188901581194013, + "flos": 70687606481280.0, + "grad_norm": 0.8200781995043724, + "language_loss": 0.55837572, + "learning_rate": 3.4621110746898452e-06, + "loss": 0.57983571, + "num_input_tokens_seen": 46695135, + "step": 2178, + "time_per_iteration": 4.358171701431274 + }, + { + "auxiliary_loss_clip": 0.01197398, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.06295514, + "balance_loss_mlp": 1.02277136, + "epoch": 0.2620092587025792, + "flos": 21069580959360.0, + "grad_norm": 1.6563492262859696, + "language_loss": 0.74328589, + "learning_rate": 3.4615794574558654e-06, + "loss": 0.76558173, + "num_input_tokens_seen": 46714145, + "step": 2179, + "time_per_iteration": 2.561577796936035 + }, + { + "auxiliary_loss_clip": 0.01178204, + "auxiliary_loss_mlp": 0.01025749, + "balance_loss_clip": 1.05941629, + "balance_loss_mlp": 1.01761293, + "epoch": 0.2621295015932183, + "flos": 18369601395840.0, + "grad_norm": 3.124222071143197, + "language_loss": 0.83916706, + "learning_rate": 3.4610476185014436e-06, + "loss": 0.86120659, + "num_input_tokens_seen": 46731405, + "step": 2180, + "time_per_iteration": 2.5642709732055664 + }, + { + "auxiliary_loss_clip": 0.01208678, + "auxiliary_loss_mlp": 0.01031623, + "balance_loss_clip": 1.06085014, + "balance_loss_mlp": 1.02179396, + "epoch": 0.2622497444838574, + "flos": 23659997063040.0, + "grad_norm": 1.8866342374167722, + "language_loss": 0.79257935, + "learning_rate": 3.4605155579072597e-06, + "loss": 0.81498241, + "num_input_tokens_seen": 46751260, + "step": 2181, + "time_per_iteration": 2.545687437057495 + }, + { + "auxiliary_loss_clip": 0.01138863, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.05171895, + "balance_loss_mlp": 1.02037311, + "epoch": 0.26236998737449646, + "flos": 22123810154880.0, + "grad_norm": 1.7193284635237824, + "language_loss": 0.71557009, + "learning_rate": 3.459983275754027e-06, + "loss": 0.73725224, + "num_input_tokens_seen": 46770155, + "step": 2182, + "time_per_iteration": 2.701566219329834 + }, + { + "auxiliary_loss_clip": 0.01208017, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.06258309, + "balance_loss_mlp": 1.02051926, + "epoch": 0.26249023026513557, + "flos": 17895185539200.0, + "grad_norm": 2.709506957094928, + "language_loss": 0.80074733, + "learning_rate": 3.4594507721224918e-06, + "loss": 0.82312316, + "num_input_tokens_seen": 46788805, + "step": 2183, + "time_per_iteration": 3.352287530899048 + }, + { + "auxiliary_loss_clip": 0.01178987, + "auxiliary_loss_mlp": 0.01041921, + "balance_loss_clip": 1.05708432, + "balance_loss_mlp": 1.03295064, + "epoch": 0.2626104731557747, + "flos": 18332936588160.0, + "grad_norm": 1.636558240169881, + "language_loss": 0.82141447, + "learning_rate": 3.4589180470934353e-06, + "loss": 0.84362358, + "num_input_tokens_seen": 46808670, + "step": 2184, + "time_per_iteration": 4.191047191619873 + }, + { + "auxiliary_loss_clip": 0.01199236, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.05877709, + "balance_loss_mlp": 1.0253948, + "epoch": 0.26273071604641374, + "flos": 19317714837120.0, + "grad_norm": 1.7760122200612447, + "language_loss": 0.76692963, + "learning_rate": 3.4583851007476713e-06, + "loss": 0.78927362, + "num_input_tokens_seen": 46827140, + "step": 2185, + "time_per_iteration": 2.5613741874694824 + }, + { + "auxiliary_loss_clip": 0.01165308, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.05647099, + "balance_loss_mlp": 1.02132988, + "epoch": 0.26285095893705285, + "flos": 18327477720960.0, + "grad_norm": 2.0776210310879475, + "language_loss": 0.6891613, + "learning_rate": 3.4578519331660464e-06, + "loss": 0.71112555, + "num_input_tokens_seen": 46844135, + "step": 2186, + "time_per_iteration": 2.7397818565368652 + }, + { + "auxiliary_loss_clip": 0.01190579, + "auxiliary_loss_mlp": 0.01032679, + "balance_loss_clip": 1.06217432, + "balance_loss_mlp": 1.02412593, + "epoch": 0.26297120182769196, + "flos": 20193827466240.0, + "grad_norm": 2.096911355778989, + "language_loss": 0.81944418, + "learning_rate": 3.4573185444294426e-06, + "loss": 0.84167671, + "num_input_tokens_seen": 46862500, + "step": 2187, + "time_per_iteration": 2.6239640712738037 + }, + { + "auxiliary_loss_clip": 0.01175456, + "auxiliary_loss_mlp": 0.00763908, + "balance_loss_clip": 1.05786455, + "balance_loss_mlp": 1.00085139, + "epoch": 0.263091444718331, + "flos": 22418421505920.0, + "grad_norm": 1.7986511251009665, + "language_loss": 0.79012066, + "learning_rate": 3.456784934618774e-06, + "loss": 0.80951428, + "num_input_tokens_seen": 46883665, + "step": 2188, + "time_per_iteration": 2.6499650478363037 + }, + { + "auxiliary_loss_clip": 0.01177985, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.05916893, + "balance_loss_mlp": 1.02302361, + "epoch": 0.2632116876089701, + "flos": 19024827338880.0, + "grad_norm": 1.9286659506983903, + "language_loss": 0.80325353, + "learning_rate": 3.4562511038149897e-06, + "loss": 0.82534832, + "num_input_tokens_seen": 46899160, + "step": 2189, + "time_per_iteration": 2.610973834991455 + }, + { + "auxiliary_loss_clip": 0.0107272, + "auxiliary_loss_mlp": 0.0100414, + "balance_loss_clip": 1.04186225, + "balance_loss_mlp": 1.00206554, + "epoch": 0.26333193049960923, + "flos": 67308054531840.0, + "grad_norm": 0.8596719810699128, + "language_loss": 0.57779741, + "learning_rate": 3.4557170520990705e-06, + "loss": 0.59856594, + "num_input_tokens_seen": 46959835, + "step": 2190, + "time_per_iteration": 3.254857301712036 + }, + { + "auxiliary_loss_clip": 0.01184648, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.05729985, + "balance_loss_mlp": 1.02413034, + "epoch": 0.2634521733902483, + "flos": 25048806468480.0, + "grad_norm": 2.596168456299673, + "language_loss": 0.86207569, + "learning_rate": 3.4551827795520324e-06, + "loss": 0.88425344, + "num_input_tokens_seen": 46982720, + "step": 2191, + "time_per_iteration": 2.6148581504821777 + }, + { + "auxiliary_loss_clip": 0.01192949, + "auxiliary_loss_mlp": 0.010263, + "balance_loss_clip": 1.05866671, + "balance_loss_mlp": 1.01775312, + "epoch": 0.2635724162808874, + "flos": 20594985534720.0, + "grad_norm": 1.6603366512539401, + "language_loss": 0.85139942, + "learning_rate": 3.4546482862549226e-06, + "loss": 0.8735919, + "num_input_tokens_seen": 47003035, + "step": 2192, + "time_per_iteration": 2.573833703994751 + }, + { + "auxiliary_loss_clip": 0.0115699, + "auxiliary_loss_mlp": 0.01038709, + "balance_loss_clip": 1.05532324, + "balance_loss_mlp": 1.02886784, + "epoch": 0.2636926591715265, + "flos": 19244636616960.0, + "grad_norm": 2.106359142695926, + "language_loss": 0.78791392, + "learning_rate": 3.4541135722888253e-06, + "loss": 0.80987096, + "num_input_tokens_seen": 47019625, + "step": 2193, + "time_per_iteration": 2.6386377811431885 + }, + { + "auxiliary_loss_clip": 0.01203023, + "auxiliary_loss_mlp": 0.01029189, + "balance_loss_clip": 1.05889523, + "balance_loss_mlp": 1.01966405, + "epoch": 0.26381290206216557, + "flos": 28804882734720.0, + "grad_norm": 1.9076585989765682, + "language_loss": 0.80558097, + "learning_rate": 3.453578637734854e-06, + "loss": 0.82790315, + "num_input_tokens_seen": 47040815, + "step": 2194, + "time_per_iteration": 2.6013710498809814 + }, + { + "auxiliary_loss_clip": 0.01209885, + "auxiliary_loss_mlp": 0.01034202, + "balance_loss_clip": 1.06535769, + "balance_loss_mlp": 1.02486193, + "epoch": 0.2639331449528047, + "flos": 25008909436800.0, + "grad_norm": 1.6391315992143196, + "language_loss": 0.78201735, + "learning_rate": 3.4530434826741605e-06, + "loss": 0.80445826, + "num_input_tokens_seen": 47061755, + "step": 2195, + "time_per_iteration": 2.553014039993286 + }, + { + "auxiliary_loss_clip": 0.01173584, + "auxiliary_loss_mlp": 0.01033158, + "balance_loss_clip": 1.05922675, + "balance_loss_mlp": 1.02456856, + "epoch": 0.26405338784344373, + "flos": 46535775465600.0, + "grad_norm": 2.0830058878795175, + "language_loss": 0.69017345, + "learning_rate": 3.452508107187926e-06, + "loss": 0.71224082, + "num_input_tokens_seen": 47085130, + "step": 2196, + "time_per_iteration": 2.774874687194824 + }, + { + "auxiliary_loss_clip": 0.01130644, + "auxiliary_loss_mlp": 0.0102902, + "balance_loss_clip": 1.04896259, + "balance_loss_mlp": 1.01932192, + "epoch": 0.26417363073408284, + "flos": 21179467641600.0, + "grad_norm": 2.514524947463671, + "language_loss": 0.77544224, + "learning_rate": 3.451972511357366e-06, + "loss": 0.79703885, + "num_input_tokens_seen": 47104675, + "step": 2197, + "time_per_iteration": 2.6837775707244873 + }, + { + "auxiliary_loss_clip": 0.01189841, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.06157112, + "balance_loss_mlp": 1.02299118, + "epoch": 0.26429387362472195, + "flos": 22674751937280.0, + "grad_norm": 1.6631402666991062, + "language_loss": 0.85201919, + "learning_rate": 3.45143669526373e-06, + "loss": 0.87422729, + "num_input_tokens_seen": 47124435, + "step": 2198, + "time_per_iteration": 2.550856590270996 + }, + { + "auxiliary_loss_clip": 0.01112672, + "auxiliary_loss_mlp": 0.0100712, + "balance_loss_clip": 1.05072165, + "balance_loss_mlp": 1.00540304, + "epoch": 0.264414116515361, + "flos": 67180534272000.0, + "grad_norm": 0.7883361354841523, + "language_loss": 0.63230938, + "learning_rate": 3.450900658988302e-06, + "loss": 0.65350729, + "num_input_tokens_seen": 47185985, + "step": 2199, + "time_per_iteration": 3.0846142768859863 + }, + { + "auxiliary_loss_clip": 0.01167213, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.05756617, + "balance_loss_mlp": 1.02717495, + "epoch": 0.2645343594060001, + "flos": 25664709997440.0, + "grad_norm": 1.8785795321841914, + "language_loss": 0.77414012, + "learning_rate": 3.450364402612397e-06, + "loss": 0.79617876, + "num_input_tokens_seen": 47203140, + "step": 2200, + "time_per_iteration": 2.60007905960083 + }, + { + "auxiliary_loss_clip": 0.0117046, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.05552101, + "balance_loss_mlp": 1.02308655, + "epoch": 0.26465460229663923, + "flos": 22491822948480.0, + "grad_norm": 1.9171533015382658, + "language_loss": 0.83797586, + "learning_rate": 3.449827926217366e-06, + "loss": 0.86000788, + "num_input_tokens_seen": 47222575, + "step": 2201, + "time_per_iteration": 2.5830206871032715 + }, + { + "auxiliary_loss_clip": 0.01180158, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.05565095, + "balance_loss_mlp": 1.0224961, + "epoch": 0.2647748451872783, + "flos": 29388036038400.0, + "grad_norm": 1.8039305930145464, + "language_loss": 0.80450892, + "learning_rate": 3.449291229884591e-06, + "loss": 0.82662767, + "num_input_tokens_seen": 47243815, + "step": 2202, + "time_per_iteration": 2.6505749225616455 + }, + { + "auxiliary_loss_clip": 0.01167585, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.05624235, + "balance_loss_mlp": 1.02018523, + "epoch": 0.2648950880779174, + "flos": 26797799502720.0, + "grad_norm": 3.3983243971699366, + "language_loss": 0.86773252, + "learning_rate": 3.4487543136954887e-06, + "loss": 0.88970351, + "num_input_tokens_seen": 47263435, + "step": 2203, + "time_per_iteration": 2.6584179401397705 + }, + { + "auxiliary_loss_clip": 0.0116103, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.05634904, + "balance_loss_mlp": 1.02539182, + "epoch": 0.2650153309685565, + "flos": 28841008838400.0, + "grad_norm": 1.659183504880564, + "language_loss": 0.91029167, + "learning_rate": 3.448217177731509e-06, + "loss": 0.93224883, + "num_input_tokens_seen": 47283920, + "step": 2204, + "time_per_iteration": 2.6511173248291016 + }, + { + "auxiliary_loss_clip": 0.01171713, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.05940711, + "balance_loss_mlp": 1.02384424, + "epoch": 0.26513557385919556, + "flos": 20303247271680.0, + "grad_norm": 2.069605312070416, + "language_loss": 0.78080666, + "learning_rate": 3.4476798220741348e-06, + "loss": 0.80284506, + "num_input_tokens_seen": 47302800, + "step": 2205, + "time_per_iteration": 3.443758249282837 + }, + { + "auxiliary_loss_clip": 0.01207371, + "auxiliary_loss_mlp": 0.01032254, + "balance_loss_clip": 1.06392574, + "balance_loss_mlp": 1.02402902, + "epoch": 0.26525581674983467, + "flos": 17676274101120.0, + "grad_norm": 1.571396884156926, + "language_loss": 0.78391492, + "learning_rate": 3.4471422468048826e-06, + "loss": 0.80631113, + "num_input_tokens_seen": 47321525, + "step": 2206, + "time_per_iteration": 2.517871856689453 + }, + { + "auxiliary_loss_clip": 0.01183666, + "auxiliary_loss_mlp": 0.01028851, + "balance_loss_clip": 1.06074798, + "balance_loss_mlp": 1.0202744, + "epoch": 0.2653760596404738, + "flos": 26833746038400.0, + "grad_norm": 3.022226023368953, + "language_loss": 0.73499399, + "learning_rate": 3.4466044520053022e-06, + "loss": 0.75711918, + "num_input_tokens_seen": 47340530, + "step": 2207, + "time_per_iteration": 2.5882511138916016 + }, + { + "auxiliary_loss_clip": 0.01163886, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.05616009, + "balance_loss_mlp": 1.02327967, + "epoch": 0.26549630253111284, + "flos": 22782160581120.0, + "grad_norm": 1.9279362087523995, + "language_loss": 0.60457009, + "learning_rate": 3.446066437756977e-06, + "loss": 0.62653089, + "num_input_tokens_seen": 47359735, + "step": 2208, + "time_per_iteration": 2.584620714187622 + }, + { + "auxiliary_loss_clip": 0.01174298, + "auxiliary_loss_mlp": 0.01024962, + "balance_loss_clip": 1.05758798, + "balance_loss_mlp": 1.01624823, + "epoch": 0.26561654542175195, + "flos": 23550002640000.0, + "grad_norm": 2.2992960028362135, + "language_loss": 0.75146151, + "learning_rate": 3.4455282041415224e-06, + "loss": 0.77345413, + "num_input_tokens_seen": 47378945, + "step": 2209, + "time_per_iteration": 2.6133382320404053 + }, + { + "auxiliary_loss_clip": 0.01166358, + "auxiliary_loss_mlp": 0.01028491, + "balance_loss_clip": 1.0576725, + "balance_loss_mlp": 1.01975298, + "epoch": 0.265736788312391, + "flos": 26906680604160.0, + "grad_norm": 2.506035992973833, + "language_loss": 0.86997288, + "learning_rate": 3.4449897512405894e-06, + "loss": 0.8919214, + "num_input_tokens_seen": 47398095, + "step": 2210, + "time_per_iteration": 4.433354139328003 + }, + { + "auxiliary_loss_clip": 0.01124917, + "auxiliary_loss_mlp": 0.00763939, + "balance_loss_clip": 1.05033374, + "balance_loss_mlp": 1.00095773, + "epoch": 0.2658570312030301, + "flos": 23477139901440.0, + "grad_norm": 2.5691505657724, + "language_loss": 0.7501049, + "learning_rate": 3.444451079135859e-06, + "loss": 0.76899344, + "num_input_tokens_seen": 47417605, + "step": 2211, + "time_per_iteration": 2.7216885089874268 + }, + { + "auxiliary_loss_clip": 0.01135794, + "auxiliary_loss_mlp": 0.00764254, + "balance_loss_clip": 1.05027723, + "balance_loss_mlp": 1.00095475, + "epoch": 0.2659772740936692, + "flos": 21866402315520.0, + "grad_norm": 4.164591014971873, + "language_loss": 0.74083984, + "learning_rate": 3.4439121879090493e-06, + "loss": 0.75984025, + "num_input_tokens_seen": 47435385, + "step": 2212, + "time_per_iteration": 2.6728148460388184 + }, + { + "auxiliary_loss_clip": 0.01180487, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.05928302, + "balance_loss_mlp": 1.02285492, + "epoch": 0.2660975169843083, + "flos": 19793100360960.0, + "grad_norm": 1.8156408081871496, + "language_loss": 0.83416331, + "learning_rate": 3.4433730776419082e-06, + "loss": 0.85628855, + "num_input_tokens_seen": 47454310, + "step": 2213, + "time_per_iteration": 2.5893635749816895 + }, + { + "auxiliary_loss_clip": 0.01194827, + "auxiliary_loss_mlp": 0.00764103, + "balance_loss_clip": 1.05907667, + "balance_loss_mlp": 1.00104117, + "epoch": 0.2662177598749474, + "flos": 29018981750400.0, + "grad_norm": 2.88617253080466, + "language_loss": 0.8063392, + "learning_rate": 3.4428337484162183e-06, + "loss": 0.82592845, + "num_input_tokens_seen": 47475120, + "step": 2214, + "time_per_iteration": 2.6042234897613525 + }, + { + "auxiliary_loss_clip": 0.0117323, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.05683851, + "balance_loss_mlp": 1.02192712, + "epoch": 0.2663380027655865, + "flos": 21762549118080.0, + "grad_norm": 1.8702691314145343, + "language_loss": 0.84157419, + "learning_rate": 3.442294200313797e-06, + "loss": 0.86361468, + "num_input_tokens_seen": 47493150, + "step": 2215, + "time_per_iteration": 2.5609002113342285 + }, + { + "auxiliary_loss_clip": 0.01129059, + "auxiliary_loss_mlp": 0.0100152, + "balance_loss_clip": 1.04827094, + "balance_loss_mlp": 1.00000596, + "epoch": 0.26645824565622556, + "flos": 66980333819520.0, + "grad_norm": 0.766375761484475, + "language_loss": 0.52656794, + "learning_rate": 3.4417544334164916e-06, + "loss": 0.54787374, + "num_input_tokens_seen": 47557295, + "step": 2216, + "time_per_iteration": 3.1085009574890137 + }, + { + "auxiliary_loss_clip": 0.0113574, + "auxiliary_loss_mlp": 0.01031027, + "balance_loss_clip": 1.05086994, + "balance_loss_mlp": 1.02155602, + "epoch": 0.26657848854686467, + "flos": 25264198373760.0, + "grad_norm": 1.7237241397883767, + "language_loss": 0.77284902, + "learning_rate": 3.4412144478061854e-06, + "loss": 0.79451668, + "num_input_tokens_seen": 47579705, + "step": 2217, + "time_per_iteration": 2.688756227493286 + }, + { + "auxiliary_loss_clip": 0.01117476, + "auxiliary_loss_mlp": 0.01030548, + "balance_loss_clip": 1.0491854, + "balance_loss_mlp": 1.0210824, + "epoch": 0.2666987314375038, + "flos": 23696769611520.0, + "grad_norm": 1.8510502611519608, + "language_loss": 0.75081944, + "learning_rate": 3.4406742435647925e-06, + "loss": 0.77229965, + "num_input_tokens_seen": 47599770, + "step": 2218, + "time_per_iteration": 2.677963972091675 + }, + { + "auxiliary_loss_clip": 0.01187372, + "auxiliary_loss_mlp": 0.01032668, + "balance_loss_clip": 1.06072783, + "balance_loss_mlp": 1.02397799, + "epoch": 0.26681897432814283, + "flos": 27048958375680.0, + "grad_norm": 2.0988809630433156, + "language_loss": 0.78762227, + "learning_rate": 3.440133820774263e-06, + "loss": 0.80982268, + "num_input_tokens_seen": 47619580, + "step": 2219, + "time_per_iteration": 2.612597703933716 + }, + { + "auxiliary_loss_clip": 0.01177929, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.05692625, + "balance_loss_mlp": 1.02911866, + "epoch": 0.26693921721878194, + "flos": 28985944216320.0, + "grad_norm": 1.9719463049090495, + "language_loss": 0.8199681, + "learning_rate": 3.439593179516578e-06, + "loss": 0.84213591, + "num_input_tokens_seen": 47639490, + "step": 2220, + "time_per_iteration": 2.597015142440796 + }, + { + "auxiliary_loss_clip": 0.01180935, + "auxiliary_loss_mlp": 0.0103203, + "balance_loss_clip": 1.05918884, + "balance_loss_mlp": 1.02271366, + "epoch": 0.26705946010942105, + "flos": 21507834798720.0, + "grad_norm": 1.7801340039728282, + "language_loss": 0.81058663, + "learning_rate": 3.4390523198737524e-06, + "loss": 0.83271623, + "num_input_tokens_seen": 47658650, + "step": 2221, + "time_per_iteration": 2.5597517490386963 + }, + { + "auxiliary_loss_clip": 0.01205964, + "auxiliary_loss_mlp": 0.00764025, + "balance_loss_clip": 1.06115246, + "balance_loss_mlp": 1.00104702, + "epoch": 0.2671797030000601, + "flos": 21471277731840.0, + "grad_norm": 1.5964224359378898, + "language_loss": 0.73500913, + "learning_rate": 3.4385112419278333e-06, + "loss": 0.75470906, + "num_input_tokens_seen": 47679875, + "step": 2222, + "time_per_iteration": 2.51297664642334 + }, + { + "auxiliary_loss_clip": 0.01119354, + "auxiliary_loss_mlp": 0.0100569, + "balance_loss_clip": 1.04763269, + "balance_loss_mlp": 1.00414073, + "epoch": 0.2672999458906992, + "flos": 64189929767040.0, + "grad_norm": 0.7976912376205909, + "language_loss": 0.64793193, + "learning_rate": 3.4379699457609033e-06, + "loss": 0.66918242, + "num_input_tokens_seen": 47737700, + "step": 2223, + "time_per_iteration": 2.986011028289795 + }, + { + "auxiliary_loss_clip": 0.01166102, + "auxiliary_loss_mlp": 0.01026669, + "balance_loss_clip": 1.0538398, + "balance_loss_mlp": 1.01759088, + "epoch": 0.26742018878133833, + "flos": 16909042573440.0, + "grad_norm": 1.9473485888576572, + "language_loss": 0.90078104, + "learning_rate": 3.4374284314550755e-06, + "loss": 0.92270875, + "num_input_tokens_seen": 47756740, + "step": 2224, + "time_per_iteration": 2.568167209625244 + }, + { + "auxiliary_loss_clip": 0.01203695, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_clip": 1.06096828, + "balance_loss_mlp": 1.01820922, + "epoch": 0.2675404316719774, + "flos": 20667560964480.0, + "grad_norm": 2.000341276099548, + "language_loss": 0.80752969, + "learning_rate": 3.436886699092498e-06, + "loss": 0.82983488, + "num_input_tokens_seen": 47775255, + "step": 2225, + "time_per_iteration": 2.687025308609009 + }, + { + "auxiliary_loss_clip": 0.01206604, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.06055307, + "balance_loss_mlp": 1.02459025, + "epoch": 0.2676606745626165, + "flos": 17485013157120.0, + "grad_norm": 2.7264236370420205, + "language_loss": 0.71757275, + "learning_rate": 3.4363447487553502e-06, + "loss": 0.73997772, + "num_input_tokens_seen": 47788570, + "step": 2226, + "time_per_iteration": 2.472900152206421 + }, + { + "auxiliary_loss_clip": 0.01170698, + "auxiliary_loss_mlp": 0.01032031, + "balance_loss_clip": 1.05710268, + "balance_loss_mlp": 1.02229166, + "epoch": 0.26778091745325555, + "flos": 27852675143040.0, + "grad_norm": 1.8891480480563503, + "language_loss": 0.77584475, + "learning_rate": 3.4358025805258455e-06, + "loss": 0.79787207, + "num_input_tokens_seen": 47808275, + "step": 2227, + "time_per_iteration": 2.6246864795684814 + }, + { + "auxiliary_loss_clip": 0.01151102, + "auxiliary_loss_mlp": 0.01024657, + "balance_loss_clip": 1.05271149, + "balance_loss_mlp": 1.01568055, + "epoch": 0.26790116034389466, + "flos": 20955995176320.0, + "grad_norm": 1.6981420800543432, + "language_loss": 0.83317292, + "learning_rate": 3.435260194486232e-06, + "loss": 0.85493058, + "num_input_tokens_seen": 47826245, + "step": 2228, + "time_per_iteration": 2.6458258628845215 + }, + { + "auxiliary_loss_clip": 0.01174902, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.05742013, + "balance_loss_mlp": 1.02044272, + "epoch": 0.2680214032345338, + "flos": 18040659621120.0, + "grad_norm": 3.3687843102912742, + "language_loss": 0.82139403, + "learning_rate": 3.4347175907187875e-06, + "loss": 0.84343982, + "num_input_tokens_seen": 47843235, + "step": 2229, + "time_per_iteration": 2.531506061553955 + }, + { + "auxiliary_loss_clip": 0.01178841, + "auxiliary_loss_mlp": 0.01036322, + "balance_loss_clip": 1.05836511, + "balance_loss_mlp": 1.02801943, + "epoch": 0.26814164612517283, + "flos": 22419427086720.0, + "grad_norm": 1.7159714206015537, + "language_loss": 0.879282, + "learning_rate": 3.4341747693058254e-06, + "loss": 0.90143371, + "num_input_tokens_seen": 47861710, + "step": 2230, + "time_per_iteration": 2.546628952026367 + }, + { + "auxiliary_loss_clip": 0.01106508, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.05124855, + "balance_loss_mlp": 1.02104545, + "epoch": 0.26826188901581194, + "flos": 35627371159680.0, + "grad_norm": 1.9615332249089135, + "language_loss": 0.77418643, + "learning_rate": 3.4336317303296916e-06, + "loss": 0.79554534, + "num_input_tokens_seen": 47882685, + "step": 2231, + "time_per_iteration": 3.6975531578063965 + }, + { + "auxiliary_loss_clip": 0.01184785, + "auxiliary_loss_mlp": 0.01026561, + "balance_loss_clip": 1.05721736, + "balance_loss_mlp": 1.01787663, + "epoch": 0.26838213190645105, + "flos": 17639788861440.0, + "grad_norm": 2.0245650612079857, + "language_loss": 0.75145757, + "learning_rate": 3.4330884738727635e-06, + "loss": 0.77357101, + "num_input_tokens_seen": 47900860, + "step": 2232, + "time_per_iteration": 2.540956974029541 + }, + { + "auxiliary_loss_clip": 0.01136939, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.05197644, + "balance_loss_mlp": 1.01986146, + "epoch": 0.2685023747970901, + "flos": 22674823764480.0, + "grad_norm": 1.850396606133055, + "language_loss": 0.70930791, + "learning_rate": 3.4325450000174535e-06, + "loss": 0.73096442, + "num_input_tokens_seen": 47917500, + "step": 2233, + "time_per_iteration": 2.6546518802642822 + }, + { + "auxiliary_loss_clip": 0.0113734, + "auxiliary_loss_mlp": 0.01034148, + "balance_loss_clip": 1.05256629, + "balance_loss_mlp": 1.02443218, + "epoch": 0.2686226176877292, + "flos": 20120533764480.0, + "grad_norm": 1.7274205688248698, + "language_loss": 0.74145687, + "learning_rate": 3.4320013088462067e-06, + "loss": 0.76317173, + "num_input_tokens_seen": 47934860, + "step": 2234, + "time_per_iteration": 2.616955280303955 + }, + { + "auxiliary_loss_clip": 0.01163611, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.05412567, + "balance_loss_mlp": 1.02103734, + "epoch": 0.2687428605783683, + "flos": 21872040750720.0, + "grad_norm": 1.4676258164373803, + "language_loss": 0.8179189, + "learning_rate": 3.431457400441499e-06, + "loss": 0.83985281, + "num_input_tokens_seen": 47955255, + "step": 2235, + "time_per_iteration": 2.6273481845855713 + }, + { + "auxiliary_loss_clip": 0.01050567, + "auxiliary_loss_mlp": 0.01007846, + "balance_loss_clip": 1.03402674, + "balance_loss_mlp": 1.00623703, + "epoch": 0.2688631034690074, + "flos": 69943320766080.0, + "grad_norm": 0.9279174900054146, + "language_loss": 0.60790086, + "learning_rate": 3.4309132748858424e-06, + "loss": 0.62848496, + "num_input_tokens_seen": 48016245, + "step": 2236, + "time_per_iteration": 4.947690963745117 + }, + { + "auxiliary_loss_clip": 0.01184347, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.0592854, + "balance_loss_mlp": 1.02332115, + "epoch": 0.2689833463596465, + "flos": 22856639431680.0, + "grad_norm": 2.0357490634703126, + "language_loss": 0.83803993, + "learning_rate": 3.430368932261779e-06, + "loss": 0.86020511, + "num_input_tokens_seen": 48036600, + "step": 2237, + "time_per_iteration": 3.3660030364990234 + }, + { + "auxiliary_loss_clip": 0.0117048, + "auxiliary_loss_mlp": 0.01029827, + "balance_loss_clip": 1.05655789, + "balance_loss_mlp": 1.02069557, + "epoch": 0.2691035892502856, + "flos": 17200242132480.0, + "grad_norm": 1.9003080193379176, + "language_loss": 0.75151074, + "learning_rate": 3.429824372651886e-06, + "loss": 0.77351379, + "num_input_tokens_seen": 48054750, + "step": 2238, + "time_per_iteration": 2.5235631465911865 + }, + { + "auxiliary_loss_clip": 0.01150177, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.05424917, + "balance_loss_mlp": 1.02627242, + "epoch": 0.26922383214092466, + "flos": 17747484814080.0, + "grad_norm": 1.9934701546489355, + "language_loss": 0.8366797, + "learning_rate": 3.4292795961387732e-06, + "loss": 0.85853362, + "num_input_tokens_seen": 48072650, + "step": 2239, + "time_per_iteration": 2.696112871170044 + }, + { + "auxiliary_loss_clip": 0.01202859, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.05973649, + "balance_loss_mlp": 1.02382112, + "epoch": 0.26934407503156377, + "flos": 16173376122240.0, + "grad_norm": 2.1702667712905153, + "language_loss": 0.87873799, + "learning_rate": 3.4287346028050818e-06, + "loss": 0.90109044, + "num_input_tokens_seen": 48088720, + "step": 2240, + "time_per_iteration": 2.4811339378356934 + }, + { + "auxiliary_loss_clip": 0.01169416, + "auxiliary_loss_mlp": 0.01027507, + "balance_loss_clip": 1.05507183, + "balance_loss_mlp": 1.01900136, + "epoch": 0.2694643179222028, + "flos": 23732895715200.0, + "grad_norm": 1.7180874399626538, + "language_loss": 0.7955277, + "learning_rate": 3.4281893927334866e-06, + "loss": 0.81749696, + "num_input_tokens_seen": 48108630, + "step": 2241, + "time_per_iteration": 2.637162923812866 + }, + { + "auxiliary_loss_clip": 0.01188866, + "auxiliary_loss_mlp": 0.01028182, + "balance_loss_clip": 1.06009424, + "balance_loss_mlp": 1.01996827, + "epoch": 0.26958456081284193, + "flos": 24718140840960.0, + "grad_norm": 2.030111149495139, + "language_loss": 0.75194442, + "learning_rate": 3.4276439660066963e-06, + "loss": 0.77411485, + "num_input_tokens_seen": 48128330, + "step": 2242, + "time_per_iteration": 2.587635040283203 + }, + { + "auxiliary_loss_clip": 0.01199403, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.05970287, + "balance_loss_mlp": 1.02209735, + "epoch": 0.26970480370348104, + "flos": 18112588606080.0, + "grad_norm": 2.0028487577953835, + "language_loss": 0.84297961, + "learning_rate": 3.427098322707452e-06, + "loss": 0.86528409, + "num_input_tokens_seen": 48144295, + "step": 2243, + "time_per_iteration": 2.4801080226898193 + }, + { + "auxiliary_loss_clip": 0.0118927, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.06410575, + "balance_loss_mlp": 1.02439713, + "epoch": 0.2698250465941201, + "flos": 10816546250880.0, + "grad_norm": 1.9671372307385728, + "language_loss": 0.89197153, + "learning_rate": 3.426552462918526e-06, + "loss": 0.91420358, + "num_input_tokens_seen": 48162230, + "step": 2244, + "time_per_iteration": 2.593916654586792 + }, + { + "auxiliary_loss_clip": 0.01202744, + "auxiliary_loss_mlp": 0.0102988, + "balance_loss_clip": 1.06268883, + "balance_loss_mlp": 1.02182698, + "epoch": 0.2699452894847592, + "flos": 17308117653120.0, + "grad_norm": 2.292324163488663, + "language_loss": 0.73412573, + "learning_rate": 3.426006386722726e-06, + "loss": 0.75645196, + "num_input_tokens_seen": 48180290, + "step": 2245, + "time_per_iteration": 2.4957385063171387 + }, + { + "auxiliary_loss_clip": 0.01160077, + "auxiliary_loss_mlp": 0.01037226, + "balance_loss_clip": 1.05859447, + "balance_loss_mlp": 1.02848828, + "epoch": 0.2700655323753983, + "flos": 18078150441600.0, + "grad_norm": 1.9090827293427153, + "language_loss": 0.92414212, + "learning_rate": 3.4254600942028914e-06, + "loss": 0.94611514, + "num_input_tokens_seen": 48198165, + "step": 2246, + "time_per_iteration": 2.623569965362549 + }, + { + "auxiliary_loss_clip": 0.01168978, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.05709541, + "balance_loss_mlp": 1.02337432, + "epoch": 0.2701857752660374, + "flos": 18186636493440.0, + "grad_norm": 10.830067433011967, + "language_loss": 0.82037365, + "learning_rate": 3.424913585441893e-06, + "loss": 0.84238148, + "num_input_tokens_seen": 48216000, + "step": 2247, + "time_per_iteration": 2.5540707111358643 + }, + { + "auxiliary_loss_clip": 0.01183803, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.0598104, + "balance_loss_mlp": 1.02165675, + "epoch": 0.2703060181566765, + "flos": 16319496648960.0, + "grad_norm": 1.9830612677353634, + "language_loss": 0.87202406, + "learning_rate": 3.4243668605226374e-06, + "loss": 0.89416611, + "num_input_tokens_seen": 48233025, + "step": 2248, + "time_per_iteration": 2.516472101211548 + }, + { + "auxiliary_loss_clip": 0.01157501, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.05503964, + "balance_loss_mlp": 1.02526188, + "epoch": 0.2704262610473156, + "flos": 19572357329280.0, + "grad_norm": 2.227013617892131, + "language_loss": 0.82716298, + "learning_rate": 3.423819919528061e-06, + "loss": 0.84907687, + "num_input_tokens_seen": 48251110, + "step": 2249, + "time_per_iteration": 2.5871667861938477 + }, + { + "auxiliary_loss_clip": 0.01149035, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.05148458, + "balance_loss_mlp": 1.02018166, + "epoch": 0.27054650393795465, + "flos": 20740746925440.0, + "grad_norm": 1.7484871819685195, + "language_loss": 0.77721608, + "learning_rate": 3.4232727625411355e-06, + "loss": 0.79900265, + "num_input_tokens_seen": 48270215, + "step": 2250, + "time_per_iteration": 2.6701366901397705 + }, + { + "auxiliary_loss_clip": 0.01118308, + "auxiliary_loss_mlp": 0.01024662, + "balance_loss_clip": 1.04822075, + "balance_loss_mlp": 1.0167346, + "epoch": 0.27066674682859376, + "flos": 18658322916480.0, + "grad_norm": 1.7222932146497025, + "language_loss": 0.86081409, + "learning_rate": 3.4227253896448626e-06, + "loss": 0.88224375, + "num_input_tokens_seen": 48288075, + "step": 2251, + "time_per_iteration": 2.643840789794922 + }, + { + "auxiliary_loss_clip": 0.01202671, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.06053793, + "balance_loss_mlp": 1.02314711, + "epoch": 0.2707869897192329, + "flos": 23002759958400.0, + "grad_norm": 3.7543008295539306, + "language_loss": 0.82018793, + "learning_rate": 3.42217780092228e-06, + "loss": 0.84253162, + "num_input_tokens_seen": 48306415, + "step": 2252, + "time_per_iteration": 2.5433032512664795 + }, + { + "auxiliary_loss_clip": 0.01091971, + "auxiliary_loss_mlp": 0.01003335, + "balance_loss_clip": 1.04173684, + "balance_loss_mlp": 1.00177336, + "epoch": 0.27090723260987193, + "flos": 58323240293760.0, + "grad_norm": 0.7879095360478465, + "language_loss": 0.60360694, + "learning_rate": 3.421629996456456e-06, + "loss": 0.62456, + "num_input_tokens_seen": 48365035, + "step": 2253, + "time_per_iteration": 3.0753674507141113 + }, + { + "auxiliary_loss_clip": 0.01185307, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.05666614, + "balance_loss_mlp": 1.02256703, + "epoch": 0.27102747550051104, + "flos": 11984540797440.0, + "grad_norm": 2.0454470776182334, + "language_loss": 0.82128537, + "learning_rate": 3.421081976330491e-06, + "loss": 0.84346157, + "num_input_tokens_seen": 48383550, + "step": 2254, + "time_per_iteration": 2.5213615894317627 + }, + { + "auxiliary_loss_clip": 0.01166618, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.05304694, + "balance_loss_mlp": 1.02470458, + "epoch": 0.27114771839115015, + "flos": 19900401264000.0, + "grad_norm": 1.8520133018526586, + "language_loss": 0.87950951, + "learning_rate": 3.4205337406275207e-06, + "loss": 0.9015137, + "num_input_tokens_seen": 48403670, + "step": 2255, + "time_per_iteration": 2.5624430179595947 + }, + { + "auxiliary_loss_clip": 0.01199932, + "auxiliary_loss_mlp": 0.01027433, + "balance_loss_clip": 1.05889618, + "balance_loss_mlp": 1.01900458, + "epoch": 0.2712679612817892, + "flos": 18331966920960.0, + "grad_norm": 2.2961845989349463, + "language_loss": 0.75398445, + "learning_rate": 3.4199852894307114e-06, + "loss": 0.77625811, + "num_input_tokens_seen": 48420420, + "step": 2256, + "time_per_iteration": 2.5139122009277344 + }, + { + "auxiliary_loss_clip": 0.01131478, + "auxiliary_loss_mlp": 0.01033429, + "balance_loss_clip": 1.05274034, + "balance_loss_mlp": 1.02495885, + "epoch": 0.2713882041724283, + "flos": 24460302038400.0, + "grad_norm": 28.465531865149284, + "language_loss": 0.78757352, + "learning_rate": 3.419436622823262e-06, + "loss": 0.80922258, + "num_input_tokens_seen": 48441140, + "step": 2257, + "time_per_iteration": 2.691471815109253 + }, + { + "auxiliary_loss_clip": 0.01172077, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.05842304, + "balance_loss_mlp": 1.02228808, + "epoch": 0.27150844706306737, + "flos": 23039317025280.0, + "grad_norm": 1.7342160261801571, + "language_loss": 0.74526668, + "learning_rate": 3.4188877408884063e-06, + "loss": 0.76729488, + "num_input_tokens_seen": 48461845, + "step": 2258, + "time_per_iteration": 3.4508912563323975 + }, + { + "auxiliary_loss_clip": 0.01170132, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.05688334, + "balance_loss_mlp": 1.0286262, + "epoch": 0.2716286899537065, + "flos": 22563644192640.0, + "grad_norm": 2.5550507144329897, + "language_loss": 0.65720934, + "learning_rate": 3.4183386437094088e-06, + "loss": 0.6792922, + "num_input_tokens_seen": 48478510, + "step": 2259, + "time_per_iteration": 2.5705761909484863 + }, + { + "auxiliary_loss_clip": 0.01173578, + "auxiliary_loss_mlp": 0.01026785, + "balance_loss_clip": 1.05497336, + "balance_loss_mlp": 1.01840472, + "epoch": 0.2717489328443456, + "flos": 13115044523520.0, + "grad_norm": 2.746135128200953, + "language_loss": 0.81819636, + "learning_rate": 3.417789331369565e-06, + "loss": 0.84020001, + "num_input_tokens_seen": 48494300, + "step": 2260, + "time_per_iteration": 2.6228742599487305 + }, + { + "auxiliary_loss_clip": 0.01205123, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.06133878, + "balance_loss_mlp": 1.02354169, + "epoch": 0.27186917573498465, + "flos": 29278688060160.0, + "grad_norm": 2.565695001007483, + "language_loss": 0.90917522, + "learning_rate": 3.4172398039522088e-06, + "loss": 0.93155396, + "num_input_tokens_seen": 48515585, + "step": 2261, + "time_per_iteration": 2.5726802349090576 + }, + { + "auxiliary_loss_clip": 0.01186335, + "auxiliary_loss_mlp": 0.01025465, + "balance_loss_clip": 1.05688477, + "balance_loss_mlp": 1.01679265, + "epoch": 0.27198941862562376, + "flos": 26032220000640.0, + "grad_norm": 1.7100227764244185, + "language_loss": 0.80055898, + "learning_rate": 3.4166900615407e-06, + "loss": 0.82267702, + "num_input_tokens_seen": 48533500, + "step": 2262, + "time_per_iteration": 2.561304807662964 + }, + { + "auxiliary_loss_clip": 0.01185469, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.05736971, + "balance_loss_mlp": 1.01979208, + "epoch": 0.27210966151626287, + "flos": 32780983760640.0, + "grad_norm": 2.514317314131098, + "language_loss": 0.75424999, + "learning_rate": 3.416140104218436e-06, + "loss": 0.77639091, + "num_input_tokens_seen": 48552865, + "step": 2263, + "time_per_iteration": 5.107882499694824 + }, + { + "auxiliary_loss_clip": 0.01086179, + "auxiliary_loss_mlp": 0.00754045, + "balance_loss_clip": 1.03053486, + "balance_loss_mlp": 1.00092733, + "epoch": 0.2722299044069019, + "flos": 65471043219840.0, + "grad_norm": 0.842769029650522, + "language_loss": 0.69652462, + "learning_rate": 3.4155899320688437e-06, + "loss": 0.71492684, + "num_input_tokens_seen": 48618940, + "step": 2264, + "time_per_iteration": 3.1810214519500732 + }, + { + "auxiliary_loss_clip": 0.0113024, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.05238247, + "balance_loss_mlp": 1.01750028, + "epoch": 0.27235014729754103, + "flos": 15334143782400.0, + "grad_norm": 2.05843568013521, + "language_loss": 0.73922384, + "learning_rate": 3.415039545175384e-06, + "loss": 0.76079756, + "num_input_tokens_seen": 48634665, + "step": 2265, + "time_per_iteration": 2.638352870941162 + }, + { + "auxiliary_loss_clip": 0.01187859, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.05834961, + "balance_loss_mlp": 1.02055252, + "epoch": 0.27247039018818014, + "flos": 21872363973120.0, + "grad_norm": 2.0995664252394306, + "language_loss": 0.65288234, + "learning_rate": 3.414488943621551e-06, + "loss": 0.67505252, + "num_input_tokens_seen": 48653330, + "step": 2266, + "time_per_iteration": 2.574570894241333 + }, + { + "auxiliary_loss_clip": 0.01182684, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.05724061, + "balance_loss_mlp": 1.02276421, + "epoch": 0.2725906330788192, + "flos": 18695490514560.0, + "grad_norm": 1.7483211306663822, + "language_loss": 0.73786974, + "learning_rate": 3.41393812749087e-06, + "loss": 0.76001251, + "num_input_tokens_seen": 48671375, + "step": 2267, + "time_per_iteration": 2.517695903778076 + }, + { + "auxiliary_loss_clip": 0.01168562, + "auxiliary_loss_mlp": 0.01030763, + "balance_loss_clip": 1.05664778, + "balance_loss_mlp": 1.02169752, + "epoch": 0.2727108759694583, + "flos": 17886099398400.0, + "grad_norm": 2.3245715058488194, + "language_loss": 0.71956003, + "learning_rate": 3.4133870968668984e-06, + "loss": 0.74155325, + "num_input_tokens_seen": 48686175, + "step": 2268, + "time_per_iteration": 2.5397541522979736 + }, + { + "auxiliary_loss_clip": 0.01174338, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.05736113, + "balance_loss_mlp": 1.02110076, + "epoch": 0.2728311188600974, + "flos": 24461666755200.0, + "grad_norm": 1.6418234099784983, + "language_loss": 0.78672612, + "learning_rate": 3.412835851833229e-06, + "loss": 0.80876768, + "num_input_tokens_seen": 48708370, + "step": 2269, + "time_per_iteration": 2.7194840908050537 + }, + { + "auxiliary_loss_clip": 0.01184135, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.06002295, + "balance_loss_mlp": 1.02146947, + "epoch": 0.2729513617507365, + "flos": 30993314757120.0, + "grad_norm": 1.7233135631284064, + "language_loss": 0.78083861, + "learning_rate": 3.4122843924734834e-06, + "loss": 0.8029846, + "num_input_tokens_seen": 48730670, + "step": 2270, + "time_per_iteration": 2.667440891265869 + }, + { + "auxiliary_loss_clip": 0.01167166, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.05530655, + "balance_loss_mlp": 1.02380538, + "epoch": 0.2730716046413756, + "flos": 19094637421440.0, + "grad_norm": 1.8046767893824014, + "language_loss": 0.87522066, + "learning_rate": 3.411732718871319e-06, + "loss": 0.89722133, + "num_input_tokens_seen": 48746510, + "step": 2271, + "time_per_iteration": 2.5938398838043213 + }, + { + "auxiliary_loss_clip": 0.01198006, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.06020331, + "balance_loss_mlp": 1.02235723, + "epoch": 0.27319184753201464, + "flos": 26944566474240.0, + "grad_norm": 1.6100913658973959, + "language_loss": 0.78764844, + "learning_rate": 3.4111808311104227e-06, + "loss": 0.80993295, + "num_input_tokens_seen": 48768825, + "step": 2272, + "time_per_iteration": 2.6117727756500244 + }, + { + "auxiliary_loss_clip": 0.01176248, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.05379152, + "balance_loss_mlp": 1.02077699, + "epoch": 0.27331209042265375, + "flos": 31759828012800.0, + "grad_norm": 2.075599805796371, + "language_loss": 0.69473076, + "learning_rate": 3.410628729274517e-06, + "loss": 0.71679461, + "num_input_tokens_seen": 48790345, + "step": 2273, + "time_per_iteration": 2.641658306121826 + }, + { + "auxiliary_loss_clip": 0.01165904, + "auxiliary_loss_mlp": 0.00763933, + "balance_loss_clip": 1.05490375, + "balance_loss_mlp": 1.00111079, + "epoch": 0.27343233331329286, + "flos": 25739081107200.0, + "grad_norm": 1.8633737130588401, + "language_loss": 0.8258971, + "learning_rate": 3.4100764134473546e-06, + "loss": 0.84519553, + "num_input_tokens_seen": 48809630, + "step": 2274, + "time_per_iteration": 2.6776325702667236 + }, + { + "auxiliary_loss_clip": 0.01200349, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.06106305, + "balance_loss_mlp": 1.02310181, + "epoch": 0.2735525762039319, + "flos": 24389414547840.0, + "grad_norm": 2.1252814747568247, + "language_loss": 0.84735936, + "learning_rate": 3.4095238837127215e-06, + "loss": 0.86967742, + "num_input_tokens_seen": 48828770, + "step": 2275, + "time_per_iteration": 2.5401053428649902 + }, + { + "auxiliary_loss_clip": 0.01152907, + "auxiliary_loss_mlp": 0.01024967, + "balance_loss_clip": 1.05246913, + "balance_loss_mlp": 1.0162468, + "epoch": 0.27367281909457103, + "flos": 14465357527680.0, + "grad_norm": 1.8405068988020015, + "language_loss": 0.7926175, + "learning_rate": 3.4089711401544355e-06, + "loss": 0.81439626, + "num_input_tokens_seen": 48846365, + "step": 2276, + "time_per_iteration": 2.635746717453003 + }, + { + "auxiliary_loss_clip": 0.01182115, + "auxiliary_loss_mlp": 0.01024031, + "balance_loss_clip": 1.05336463, + "balance_loss_mlp": 1.01560259, + "epoch": 0.27379306198521014, + "flos": 23476996247040.0, + "grad_norm": 2.5395313500270675, + "language_loss": 0.67487431, + "learning_rate": 3.4084181828563486e-06, + "loss": 0.69693577, + "num_input_tokens_seen": 48863085, + "step": 2277, + "time_per_iteration": 2.5924232006073 + }, + { + "auxiliary_loss_clip": 0.01142037, + "auxiliary_loss_mlp": 0.01026488, + "balance_loss_clip": 1.05180907, + "balance_loss_mlp": 1.01777411, + "epoch": 0.2739133048758492, + "flos": 17458152762240.0, + "grad_norm": 1.63872546872183, + "language_loss": 0.70435667, + "learning_rate": 3.4078650119023428e-06, + "loss": 0.72604191, + "num_input_tokens_seen": 48881400, + "step": 2278, + "time_per_iteration": 2.6369099617004395 + }, + { + "auxiliary_loss_clip": 0.01129631, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.04789996, + "balance_loss_mlp": 1.02339005, + "epoch": 0.2740335477664883, + "flos": 19273113123840.0, + "grad_norm": 2.3591697923356945, + "language_loss": 0.74473888, + "learning_rate": 3.4073116273763337e-06, + "loss": 0.76636648, + "num_input_tokens_seen": 48895845, + "step": 2279, + "time_per_iteration": 2.6410837173461914 + }, + { + "auxiliary_loss_clip": 0.01175502, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.05449617, + "balance_loss_mlp": 1.02364254, + "epoch": 0.2741537906571274, + "flos": 26104723603200.0, + "grad_norm": 1.7409045269684267, + "language_loss": 0.8121416, + "learning_rate": 3.40675802936227e-06, + "loss": 0.83422792, + "num_input_tokens_seen": 48916630, + "step": 2280, + "time_per_iteration": 2.618314743041992 + }, + { + "auxiliary_loss_clip": 0.01166766, + "auxiliary_loss_mlp": 0.01037257, + "balance_loss_clip": 1.0579977, + "balance_loss_mlp": 1.02746391, + "epoch": 0.27427403354776647, + "flos": 34164190644480.0, + "grad_norm": 2.076216025342256, + "language_loss": 0.7152608, + "learning_rate": 3.4062042179441318e-06, + "loss": 0.73730099, + "num_input_tokens_seen": 48937100, + "step": 2281, + "time_per_iteration": 2.716334581375122 + }, + { + "auxiliary_loss_clip": 0.01182867, + "auxiliary_loss_mlp": 0.0102598, + "balance_loss_clip": 1.0586555, + "balance_loss_mlp": 1.01770687, + "epoch": 0.2743942764384056, + "flos": 18766988536320.0, + "grad_norm": 2.4691139178697217, + "language_loss": 0.80563724, + "learning_rate": 3.4056501932059314e-06, + "loss": 0.82772565, + "num_input_tokens_seen": 48955175, + "step": 2282, + "time_per_iteration": 2.580759048461914 + }, + { + "auxiliary_loss_clip": 0.01110148, + "auxiliary_loss_mlp": 0.01003243, + "balance_loss_clip": 1.03339624, + "balance_loss_mlp": 1.00190163, + "epoch": 0.2745145193290447, + "flos": 64904048058240.0, + "grad_norm": 0.7666347693249996, + "language_loss": 0.58153772, + "learning_rate": 3.405095955231715e-06, + "loss": 0.60267162, + "num_input_tokens_seen": 49006830, + "step": 2283, + "time_per_iteration": 3.0404679775238037 + }, + { + "auxiliary_loss_clip": 0.01189729, + "auxiliary_loss_mlp": 0.01027791, + "balance_loss_clip": 1.05763578, + "balance_loss_mlp": 1.01886785, + "epoch": 0.27463476221968375, + "flos": 16136926796160.0, + "grad_norm": 2.523924182004298, + "language_loss": 0.94218481, + "learning_rate": 3.4045415041055585e-06, + "loss": 0.96436006, + "num_input_tokens_seen": 49022470, + "step": 2284, + "time_per_iteration": 2.5761613845825195 + }, + { + "auxiliary_loss_clip": 0.01176194, + "auxiliary_loss_mlp": 0.010301, + "balance_loss_clip": 1.05835891, + "balance_loss_mlp": 1.02065849, + "epoch": 0.27475500511032286, + "flos": 10376712213120.0, + "grad_norm": 2.1045137554009656, + "language_loss": 0.78368366, + "learning_rate": 3.4039868399115728e-06, + "loss": 0.80574656, + "num_input_tokens_seen": 49037110, + "step": 2285, + "time_per_iteration": 3.416395425796509 + }, + { + "auxiliary_loss_clip": 0.01136074, + "auxiliary_loss_mlp": 0.01029949, + "balance_loss_clip": 1.05695748, + "balance_loss_mlp": 1.02121651, + "epoch": 0.27487524800096197, + "flos": 17311062568320.0, + "grad_norm": 1.7549044070174171, + "language_loss": 0.80335116, + "learning_rate": 3.4034319627339003e-06, + "loss": 0.82501137, + "num_input_tokens_seen": 49053975, + "step": 2286, + "time_per_iteration": 2.652266025543213 + }, + { + "auxiliary_loss_clip": 0.01175356, + "auxiliary_loss_mlp": 0.01035137, + "balance_loss_clip": 1.0593164, + "balance_loss_mlp": 1.02606475, + "epoch": 0.274995490891601, + "flos": 27120205002240.0, + "grad_norm": 2.7455388767119713, + "language_loss": 0.69592637, + "learning_rate": 3.402876872656715e-06, + "loss": 0.71803129, + "num_input_tokens_seen": 49072295, + "step": 2287, + "time_per_iteration": 2.626197099685669 + }, + { + "auxiliary_loss_clip": 0.01171219, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.05758548, + "balance_loss_mlp": 1.02590561, + "epoch": 0.27511573378224013, + "flos": 23436093634560.0, + "grad_norm": 7.9369149199794355, + "language_loss": 0.89517951, + "learning_rate": 3.402321569764223e-06, + "loss": 0.91723728, + "num_input_tokens_seen": 49091600, + "step": 2288, + "time_per_iteration": 2.643428325653076 + }, + { + "auxiliary_loss_clip": 0.01150496, + "auxiliary_loss_mlp": 0.00764772, + "balance_loss_clip": 1.05458713, + "balance_loss_mlp": 1.00131893, + "epoch": 0.2752359766728792, + "flos": 16722019434240.0, + "grad_norm": 3.2711832734459536, + "language_loss": 0.83556664, + "learning_rate": 3.4017660541406635e-06, + "loss": 0.8547194, + "num_input_tokens_seen": 49107665, + "step": 2289, + "time_per_iteration": 3.530979871749878 + }, + { + "auxiliary_loss_clip": 0.0118377, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.05886519, + "balance_loss_mlp": 1.01952124, + "epoch": 0.2753562195635183, + "flos": 25297738698240.0, + "grad_norm": 1.7170772357239443, + "language_loss": 0.74407005, + "learning_rate": 3.4012103258703092e-06, + "loss": 0.7661916, + "num_input_tokens_seen": 49126420, + "step": 2290, + "time_per_iteration": 4.260874509811401 + }, + { + "auxiliary_loss_clip": 0.01157504, + "auxiliary_loss_mlp": 0.01023131, + "balance_loss_clip": 1.05454397, + "balance_loss_mlp": 1.01430392, + "epoch": 0.2754764624541574, + "flos": 27338972785920.0, + "grad_norm": 2.0528263521778687, + "language_loss": 0.82945174, + "learning_rate": 3.4006543850374616e-06, + "loss": 0.85125804, + "num_input_tokens_seen": 49141470, + "step": 2291, + "time_per_iteration": 2.617689371109009 + }, + { + "auxiliary_loss_clip": 0.01189104, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.05748844, + "balance_loss_mlp": 1.02490497, + "epoch": 0.27559670534479647, + "flos": 17238379397760.0, + "grad_norm": 2.0068094901254887, + "language_loss": 0.74810296, + "learning_rate": 3.400098231726458e-06, + "loss": 0.77032638, + "num_input_tokens_seen": 49158570, + "step": 2292, + "time_per_iteration": 2.4912991523742676 + }, + { + "auxiliary_loss_clip": 0.01162393, + "auxiliary_loss_mlp": 0.01036679, + "balance_loss_clip": 1.0534339, + "balance_loss_mlp": 1.02729678, + "epoch": 0.2757169482354356, + "flos": 21939085486080.0, + "grad_norm": 1.8171196006707633, + "language_loss": 0.87356663, + "learning_rate": 3.3995418660216657e-06, + "loss": 0.89555728, + "num_input_tokens_seen": 49176025, + "step": 2293, + "time_per_iteration": 2.6027116775512695 + }, + { + "auxiliary_loss_clip": 0.01208866, + "auxiliary_loss_mlp": 0.01034035, + "balance_loss_clip": 1.06288445, + "balance_loss_mlp": 1.02504635, + "epoch": 0.2758371911260747, + "flos": 20850669521280.0, + "grad_norm": 2.4482372769351284, + "language_loss": 0.80623233, + "learning_rate": 3.3989852880074848e-06, + "loss": 0.82866138, + "num_input_tokens_seen": 49197455, + "step": 2294, + "time_per_iteration": 2.487154245376587 + }, + { + "auxiliary_loss_clip": 0.01089437, + "auxiliary_loss_mlp": 0.01003293, + "balance_loss_clip": 1.0398109, + "balance_loss_mlp": 1.00113487, + "epoch": 0.27595743401671374, + "flos": 69269063592960.0, + "grad_norm": 0.7411011988427074, + "language_loss": 0.60670304, + "learning_rate": 3.398428497768348e-06, + "loss": 0.62763035, + "num_input_tokens_seen": 49262625, + "step": 2295, + "time_per_iteration": 3.242403268814087 + }, + { + "auxiliary_loss_clip": 0.01168106, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.05554807, + "balance_loss_mlp": 1.01948631, + "epoch": 0.27607767690735285, + "flos": 21215019127680.0, + "grad_norm": 1.7996103019163994, + "language_loss": 0.71940064, + "learning_rate": 3.3978714953887205e-06, + "loss": 0.74136877, + "num_input_tokens_seen": 49282380, + "step": 2296, + "time_per_iteration": 2.6048669815063477 + }, + { + "auxiliary_loss_clip": 0.01131485, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.04829657, + "balance_loss_mlp": 1.0204345, + "epoch": 0.27619791979799196, + "flos": 24825334003200.0, + "grad_norm": 1.9130129589227036, + "language_loss": 0.85990274, + "learning_rate": 3.397314280953098e-06, + "loss": 0.88151181, + "num_input_tokens_seen": 49303205, + "step": 2297, + "time_per_iteration": 2.638890027999878 + }, + { + "auxiliary_loss_clip": 0.01164224, + "auxiliary_loss_mlp": 0.0102636, + "balance_loss_clip": 1.05412889, + "balance_loss_mlp": 1.01782489, + "epoch": 0.276318162688631, + "flos": 24753548672640.0, + "grad_norm": 3.319790641089275, + "language_loss": 0.8011657, + "learning_rate": 3.3967568545460108e-06, + "loss": 0.8230716, + "num_input_tokens_seen": 49322745, + "step": 2298, + "time_per_iteration": 2.556432008743286 + }, + { + "auxiliary_loss_clip": 0.01187012, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.06077337, + "balance_loss_mlp": 1.02340794, + "epoch": 0.27643840557927013, + "flos": 18150007599360.0, + "grad_norm": 2.2343643399360884, + "language_loss": 0.80108303, + "learning_rate": 3.3961992162520185e-06, + "loss": 0.82327819, + "num_input_tokens_seen": 49341370, + "step": 2299, + "time_per_iteration": 2.5123610496520996 + }, + { + "auxiliary_loss_clip": 0.01188966, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.05956244, + "balance_loss_mlp": 1.02217984, + "epoch": 0.27655864846990924, + "flos": 24823933372800.0, + "grad_norm": 2.319248446740071, + "language_loss": 0.71698606, + "learning_rate": 3.3956413661557156e-06, + "loss": 0.73919129, + "num_input_tokens_seen": 49361545, + "step": 2300, + "time_per_iteration": 2.5811564922332764 + }, + { + "auxiliary_loss_clip": 0.01165889, + "auxiliary_loss_mlp": 0.0103373, + "balance_loss_clip": 1.05479372, + "balance_loss_mlp": 1.02438974, + "epoch": 0.2766788913605483, + "flos": 20266582464000.0, + "grad_norm": 2.245048520138294, + "language_loss": 0.66343451, + "learning_rate": 3.3950833043417273e-06, + "loss": 0.68543071, + "num_input_tokens_seen": 49379690, + "step": 2301, + "time_per_iteration": 2.6058032512664795 + }, + { + "auxiliary_loss_clip": 0.01192258, + "auxiliary_loss_mlp": 0.01028703, + "balance_loss_clip": 1.06261659, + "balance_loss_mlp": 1.0188148, + "epoch": 0.2767991342511874, + "flos": 21470272151040.0, + "grad_norm": 2.1459483031713105, + "language_loss": 0.72972083, + "learning_rate": 3.3945250308947105e-06, + "loss": 0.75193042, + "num_input_tokens_seen": 49395995, + "step": 2302, + "time_per_iteration": 2.5117101669311523 + }, + { + "auxiliary_loss_clip": 0.01100616, + "auxiliary_loss_mlp": 0.01010573, + "balance_loss_clip": 1.03200841, + "balance_loss_mlp": 1.00903499, + "epoch": 0.2769193771418265, + "flos": 66002627571840.0, + "grad_norm": 1.2516432673174016, + "language_loss": 0.68342334, + "learning_rate": 3.3939665458993556e-06, + "loss": 0.70453525, + "num_input_tokens_seen": 49450415, + "step": 2303, + "time_per_iteration": 3.0422446727752686 + }, + { + "auxiliary_loss_clip": 0.01163331, + "auxiliary_loss_mlp": 0.00764806, + "balance_loss_clip": 1.05333459, + "balance_loss_mlp": 1.00124931, + "epoch": 0.27703962003246557, + "flos": 20704441253760.0, + "grad_norm": 1.8284182175453665, + "language_loss": 0.76726037, + "learning_rate": 3.3934078494403843e-06, + "loss": 0.7865417, + "num_input_tokens_seen": 49469990, + "step": 2304, + "time_per_iteration": 2.6103415489196777 + }, + { + "auxiliary_loss_clip": 0.01109891, + "auxiliary_loss_mlp": 0.01043208, + "balance_loss_clip": 1.04834855, + "balance_loss_mlp": 1.03272331, + "epoch": 0.2771598629231047, + "flos": 22929897219840.0, + "grad_norm": 2.4626805714158357, + "language_loss": 0.81398809, + "learning_rate": 3.3928489416025495e-06, + "loss": 0.83551908, + "num_input_tokens_seen": 49490835, + "step": 2305, + "time_per_iteration": 2.688297748565674 + }, + { + "auxiliary_loss_clip": 0.01171536, + "auxiliary_loss_mlp": 0.0104041, + "balance_loss_clip": 1.05652678, + "balance_loss_mlp": 1.03034282, + "epoch": 0.27728010581374374, + "flos": 18369457741440.0, + "grad_norm": 2.490454316898515, + "language_loss": 0.78673679, + "learning_rate": 3.392289822470638e-06, + "loss": 0.80885625, + "num_input_tokens_seen": 49508815, + "step": 2306, + "time_per_iteration": 2.5390889644622803 + }, + { + "auxiliary_loss_clip": 0.01169412, + "auxiliary_loss_mlp": 0.0102802, + "balance_loss_clip": 1.05469537, + "balance_loss_mlp": 1.01853752, + "epoch": 0.27740034870438285, + "flos": 19427637432960.0, + "grad_norm": 1.9863388544497083, + "language_loss": 0.75672752, + "learning_rate": 3.3917304921294674e-06, + "loss": 0.77870184, + "num_input_tokens_seen": 49526980, + "step": 2307, + "time_per_iteration": 2.6404366493225098 + }, + { + "auxiliary_loss_clip": 0.0118897, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.05825448, + "balance_loss_mlp": 1.02432537, + "epoch": 0.27752059159502196, + "flos": 21614776565760.0, + "grad_norm": 1.6643399870341387, + "language_loss": 0.8058055, + "learning_rate": 3.3911709506638876e-06, + "loss": 0.82803428, + "num_input_tokens_seen": 49546290, + "step": 2308, + "time_per_iteration": 2.532336711883545 + }, + { + "auxiliary_loss_clip": 0.01147685, + "auxiliary_loss_mlp": 0.00764965, + "balance_loss_clip": 1.0495584, + "balance_loss_mlp": 1.00126362, + "epoch": 0.277640834485661, + "flos": 26608011016320.0, + "grad_norm": 2.2848239249049964, + "language_loss": 0.81060874, + "learning_rate": 3.390611198158781e-06, + "loss": 0.82973528, + "num_input_tokens_seen": 49564165, + "step": 2309, + "time_per_iteration": 2.6282243728637695 + }, + { + "auxiliary_loss_clip": 0.01207549, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.06244457, + "balance_loss_mlp": 1.02489567, + "epoch": 0.2777610773763001, + "flos": 19492814661120.0, + "grad_norm": 2.154266483990461, + "language_loss": 0.89846921, + "learning_rate": 3.3900512346990612e-06, + "loss": 0.92088485, + "num_input_tokens_seen": 49580155, + "step": 2310, + "time_per_iteration": 2.4862213134765625 + }, + { + "auxiliary_loss_clip": 0.01145768, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.04983544, + "balance_loss_mlp": 1.02585864, + "epoch": 0.27788132026693924, + "flos": 38290650001920.0, + "grad_norm": 1.9653351240014445, + "language_loss": 0.65747881, + "learning_rate": 3.389491060369674e-06, + "loss": 0.67929792, + "num_input_tokens_seen": 49605830, + "step": 2311, + "time_per_iteration": 3.580845832824707 + }, + { + "auxiliary_loss_clip": 0.01135299, + "auxiliary_loss_mlp": 0.01026795, + "balance_loss_clip": 1.05042803, + "balance_loss_mlp": 1.01775265, + "epoch": 0.2780015631575783, + "flos": 22382546797440.0, + "grad_norm": 1.975646644909417, + "language_loss": 0.8890394, + "learning_rate": 3.388930675255598e-06, + "loss": 0.91066039, + "num_input_tokens_seen": 49625680, + "step": 2312, + "time_per_iteration": 2.6129791736602783 + }, + { + "auxiliary_loss_clip": 0.01180119, + "auxiliary_loss_mlp": 0.01033888, + "balance_loss_clip": 1.05840492, + "balance_loss_mlp": 1.02352226, + "epoch": 0.2781218060482174, + "flos": 12203200840320.0, + "grad_norm": 2.493920053750425, + "language_loss": 0.79525542, + "learning_rate": 3.388370079441843e-06, + "loss": 0.81739551, + "num_input_tokens_seen": 49641195, + "step": 2313, + "time_per_iteration": 2.551487684249878 + }, + { + "auxiliary_loss_clip": 0.01162172, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.05983114, + "balance_loss_mlp": 1.02633989, + "epoch": 0.2782420489388565, + "flos": 18107632529280.0, + "grad_norm": 2.2083455780340984, + "language_loss": 0.92839342, + "learning_rate": 3.3878092730134505e-06, + "loss": 0.95036948, + "num_input_tokens_seen": 49659180, + "step": 2314, + "time_per_iteration": 2.5407063961029053 + }, + { + "auxiliary_loss_clip": 0.01180894, + "auxiliary_loss_mlp": 0.01035914, + "balance_loss_clip": 1.05730247, + "balance_loss_mlp": 1.02655625, + "epoch": 0.27836229182949557, + "flos": 18514752255360.0, + "grad_norm": 2.309529374740936, + "language_loss": 0.80695623, + "learning_rate": 3.3872482560554947e-06, + "loss": 0.82912433, + "num_input_tokens_seen": 49677955, + "step": 2315, + "time_per_iteration": 2.4814672470092773 + }, + { + "auxiliary_loss_clip": 0.0109702, + "auxiliary_loss_mlp": 0.01001625, + "balance_loss_clip": 1.02963758, + "balance_loss_mlp": 1.0001471, + "epoch": 0.2784825347201347, + "flos": 67079230940160.0, + "grad_norm": 0.7948957031337394, + "language_loss": 0.57035553, + "learning_rate": 3.386687028653082e-06, + "loss": 0.59134197, + "num_input_tokens_seen": 49740800, + "step": 2316, + "time_per_iteration": 4.028663396835327 + }, + { + "auxiliary_loss_clip": 0.01146956, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.05576205, + "balance_loss_mlp": 1.02076316, + "epoch": 0.2786027776107738, + "flos": 22631119891200.0, + "grad_norm": 1.9196492220412786, + "language_loss": 0.84998566, + "learning_rate": 3.386125590891349e-06, + "loss": 0.8717587, + "num_input_tokens_seen": 49757675, + "step": 2317, + "time_per_iteration": 3.359004259109497 + }, + { + "auxiliary_loss_clip": 0.01161774, + "auxiliary_loss_mlp": 0.01028775, + "balance_loss_clip": 1.05310535, + "balance_loss_mlp": 1.01991177, + "epoch": 0.27872302050141284, + "flos": 15778826156160.0, + "grad_norm": 2.513360774218176, + "language_loss": 0.82911432, + "learning_rate": 3.3855639428554657e-06, + "loss": 0.85101986, + "num_input_tokens_seen": 49775205, + "step": 2318, + "time_per_iteration": 2.5261754989624023 + }, + { + "auxiliary_loss_clip": 0.01148919, + "auxiliary_loss_mlp": 0.01028438, + "balance_loss_clip": 1.05549622, + "balance_loss_mlp": 1.01964068, + "epoch": 0.27884326339205195, + "flos": 22126970551680.0, + "grad_norm": 1.8332929501310893, + "language_loss": 0.80668819, + "learning_rate": 3.385002084630635e-06, + "loss": 0.82846177, + "num_input_tokens_seen": 49794175, + "step": 2319, + "time_per_iteration": 2.569084405899048 + }, + { + "auxiliary_loss_clip": 0.01196523, + "auxiliary_loss_mlp": 0.01034708, + "balance_loss_clip": 1.06220078, + "balance_loss_mlp": 1.02451563, + "epoch": 0.278963506282691, + "flos": 20558715776640.0, + "grad_norm": 2.149297710017262, + "language_loss": 0.85028422, + "learning_rate": 3.384440016302088e-06, + "loss": 0.8725965, + "num_input_tokens_seen": 49812850, + "step": 2320, + "time_per_iteration": 2.513162136077881 + }, + { + "auxiliary_loss_clip": 0.01185125, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.05859601, + "balance_loss_mlp": 1.0251838, + "epoch": 0.2790837491733301, + "flos": 21942928241280.0, + "grad_norm": 2.5064229707667804, + "language_loss": 0.62424409, + "learning_rate": 3.3838777379550923e-06, + "loss": 0.64644217, + "num_input_tokens_seen": 49832295, + "step": 2321, + "time_per_iteration": 2.5065596103668213 + }, + { + "auxiliary_loss_clip": 0.01179955, + "auxiliary_loss_mlp": 0.01035653, + "balance_loss_clip": 1.05970931, + "balance_loss_mlp": 1.02632487, + "epoch": 0.27920399206396923, + "flos": 26286790665600.0, + "grad_norm": 2.065703189611674, + "language_loss": 0.78120005, + "learning_rate": 3.383315249674944e-06, + "loss": 0.80335611, + "num_input_tokens_seen": 49850860, + "step": 2322, + "time_per_iteration": 2.5916476249694824 + }, + { + "auxiliary_loss_clip": 0.01162897, + "auxiliary_loss_mlp": 0.0103328, + "balance_loss_clip": 1.05666399, + "balance_loss_mlp": 1.0240953, + "epoch": 0.2793242349546083, + "flos": 25400981364480.0, + "grad_norm": 2.454702743351978, + "language_loss": 0.86042655, + "learning_rate": 3.3827525515469715e-06, + "loss": 0.88238835, + "num_input_tokens_seen": 49865765, + "step": 2323, + "time_per_iteration": 2.6412618160247803 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01040581, + "balance_loss_clip": 1.05057168, + "balance_loss_mlp": 1.03035247, + "epoch": 0.2794444778452474, + "flos": 20850346298880.0, + "grad_norm": 2.6376881586176335, + "language_loss": 0.71126264, + "learning_rate": 3.3821896436565367e-06, + "loss": 0.73316348, + "num_input_tokens_seen": 49885425, + "step": 2324, + "time_per_iteration": 2.594083547592163 + }, + { + "auxiliary_loss_clip": 0.01193476, + "auxiliary_loss_mlp": 0.0103658, + "balance_loss_clip": 1.06444263, + "balance_loss_mlp": 1.0272342, + "epoch": 0.2795647207358865, + "flos": 21576244250880.0, + "grad_norm": 1.7008364518968293, + "language_loss": 0.7003355, + "learning_rate": 3.381626526089032e-06, + "loss": 0.72263604, + "num_input_tokens_seen": 49904990, + "step": 2325, + "time_per_iteration": 2.5412697792053223 + }, + { + "auxiliary_loss_clip": 0.01172224, + "auxiliary_loss_mlp": 0.01029762, + "balance_loss_clip": 1.0559392, + "balance_loss_mlp": 1.020064, + "epoch": 0.27968496362652556, + "flos": 21471744608640.0, + "grad_norm": 1.9893451231395027, + "language_loss": 0.78937197, + "learning_rate": 3.3810631989298815e-06, + "loss": 0.81139183, + "num_input_tokens_seen": 49924600, + "step": 2326, + "time_per_iteration": 2.608919858932495 + }, + { + "auxiliary_loss_clip": 0.01156335, + "auxiliary_loss_mlp": 0.0103312, + "balance_loss_clip": 1.06092358, + "balance_loss_mlp": 1.02234364, + "epoch": 0.2798052065171647, + "flos": 23258695340160.0, + "grad_norm": 2.3114567806177178, + "language_loss": 0.84328163, + "learning_rate": 3.3804996622645423e-06, + "loss": 0.86517608, + "num_input_tokens_seen": 49942600, + "step": 2327, + "time_per_iteration": 2.665694236755371 + }, + { + "auxiliary_loss_clip": 0.01205958, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.06268311, + "balance_loss_mlp": 1.02151835, + "epoch": 0.2799254494078038, + "flos": 21539328048000.0, + "grad_norm": 2.898145915437923, + "language_loss": 0.89373565, + "learning_rate": 3.3799359161785015e-06, + "loss": 0.91610086, + "num_input_tokens_seen": 49962250, + "step": 2328, + "time_per_iteration": 2.510000467300415 + }, + { + "auxiliary_loss_clip": 0.0118616, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.05907881, + "balance_loss_mlp": 1.02512813, + "epoch": 0.28004569229844284, + "flos": 26393912000640.0, + "grad_norm": 1.6260675827277233, + "language_loss": 0.85655361, + "learning_rate": 3.3793719607572798e-06, + "loss": 0.8787632, + "num_input_tokens_seen": 49983215, + "step": 2329, + "time_per_iteration": 2.5816659927368164 + }, + { + "auxiliary_loss_clip": 0.01157664, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.05366766, + "balance_loss_mlp": 1.02076852, + "epoch": 0.28016593518908195, + "flos": 33547676584320.0, + "grad_norm": 2.100938306879155, + "language_loss": 0.77496648, + "learning_rate": 3.378807796086428e-06, + "loss": 0.79684114, + "num_input_tokens_seen": 50006075, + "step": 2330, + "time_per_iteration": 2.6494498252868652 + }, + { + "auxiliary_loss_clip": 0.01209438, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.06612802, + "balance_loss_mlp": 1.02177012, + "epoch": 0.28028617807972106, + "flos": 15340823712000.0, + "grad_norm": 1.9293112768998422, + "language_loss": 0.77006054, + "learning_rate": 3.37824342225153e-06, + "loss": 0.79247063, + "num_input_tokens_seen": 50022495, + "step": 2331, + "time_per_iteration": 2.436617136001587 + }, + { + "auxiliary_loss_clip": 0.01148122, + "auxiliary_loss_mlp": 0.01033854, + "balance_loss_clip": 1.05807436, + "balance_loss_mlp": 1.02456784, + "epoch": 0.2804064209703601, + "flos": 25520277409920.0, + "grad_norm": 1.7922196358332234, + "language_loss": 0.77682823, + "learning_rate": 3.3776788393382006e-06, + "loss": 0.798648, + "num_input_tokens_seen": 50041975, + "step": 2332, + "time_per_iteration": 2.6401240825653076 + }, + { + "auxiliary_loss_clip": 0.01206985, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.06424546, + "balance_loss_mlp": 1.0206244, + "epoch": 0.2805266638609992, + "flos": 29351766280320.0, + "grad_norm": 2.061127767207515, + "language_loss": 0.76400775, + "learning_rate": 3.3771140474320872e-06, + "loss": 0.78637791, + "num_input_tokens_seen": 50061925, + "step": 2333, + "time_per_iteration": 2.5431337356567383 + }, + { + "auxiliary_loss_clip": 0.01169427, + "auxiliary_loss_mlp": 0.01036684, + "balance_loss_clip": 1.0593071, + "balance_loss_mlp": 1.02760649, + "epoch": 0.28064690675163834, + "flos": 21463735875840.0, + "grad_norm": 2.161381305127918, + "language_loss": 0.79593223, + "learning_rate": 3.3765490466188664e-06, + "loss": 0.81799334, + "num_input_tokens_seen": 50079325, + "step": 2334, + "time_per_iteration": 2.5762736797332764 + }, + { + "auxiliary_loss_clip": 0.01159924, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.05718207, + "balance_loss_mlp": 1.02011597, + "epoch": 0.2807671496422774, + "flos": 20995640812800.0, + "grad_norm": 2.473230499343083, + "language_loss": 0.7369501, + "learning_rate": 3.3759838369842508e-06, + "loss": 0.75884885, + "num_input_tokens_seen": 50097400, + "step": 2335, + "time_per_iteration": 2.566033363342285 + }, + { + "auxiliary_loss_clip": 0.0116503, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.06084752, + "balance_loss_mlp": 1.02295387, + "epoch": 0.2808873925329165, + "flos": 21506577822720.0, + "grad_norm": 2.2176781950601927, + "language_loss": 0.72732151, + "learning_rate": 3.375418418613981e-06, + "loss": 0.749291, + "num_input_tokens_seen": 50116425, + "step": 2336, + "time_per_iteration": 2.559135675430298 + }, + { + "auxiliary_loss_clip": 0.01176915, + "auxiliary_loss_mlp": 0.01034773, + "balance_loss_clip": 1.06053233, + "balance_loss_mlp": 1.02447891, + "epoch": 0.28100763542355556, + "flos": 16070815814400.0, + "grad_norm": 3.5260090584103305, + "language_loss": 0.83013093, + "learning_rate": 3.374852791593831e-06, + "loss": 0.85224783, + "num_input_tokens_seen": 50132625, + "step": 2337, + "time_per_iteration": 3.3774821758270264 + }, + { + "auxiliary_loss_clip": 0.01156944, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.05524826, + "balance_loss_mlp": 1.02388132, + "epoch": 0.28112787831419467, + "flos": 19062605468160.0, + "grad_norm": 3.0706785236155483, + "language_loss": 0.53758192, + "learning_rate": 3.374286956009605e-06, + "loss": 0.55948687, + "num_input_tokens_seen": 50151190, + "step": 2338, + "time_per_iteration": 2.5810139179229736 + }, + { + "auxiliary_loss_clip": 0.01192565, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.06559896, + "balance_loss_mlp": 1.02130938, + "epoch": 0.2812481212048338, + "flos": 12823629482880.0, + "grad_norm": 2.024522059228888, + "language_loss": 0.75487411, + "learning_rate": 3.3737209119471405e-06, + "loss": 0.77710736, + "num_input_tokens_seen": 50167700, + "step": 2339, + "time_per_iteration": 2.5333142280578613 + }, + { + "auxiliary_loss_clip": 0.01198482, + "auxiliary_loss_mlp": 0.0103058, + "balance_loss_clip": 1.06378555, + "balance_loss_mlp": 1.02078724, + "epoch": 0.28136836409547283, + "flos": 15633064765440.0, + "grad_norm": 3.3468805786252953, + "language_loss": 0.63466692, + "learning_rate": 3.373154659492306e-06, + "loss": 0.65695763, + "num_input_tokens_seen": 50185840, + "step": 2340, + "time_per_iteration": 2.509500741958618 + }, + { + "auxiliary_loss_clip": 0.01178868, + "auxiliary_loss_mlp": 0.01045446, + "balance_loss_clip": 1.060274, + "balance_loss_mlp": 1.03621376, + "epoch": 0.28148860698611194, + "flos": 19933726106880.0, + "grad_norm": 1.8242314159819109, + "language_loss": 0.85186571, + "learning_rate": 3.3725881987310016e-06, + "loss": 0.87410885, + "num_input_tokens_seen": 50203375, + "step": 2341, + "time_per_iteration": 2.53665828704834 + }, + { + "auxiliary_loss_clip": 0.01173563, + "auxiliary_loss_mlp": 0.01033675, + "balance_loss_clip": 1.05899489, + "balance_loss_mlp": 1.02503204, + "epoch": 0.28160884987675106, + "flos": 17457219008640.0, + "grad_norm": 1.958764281288809, + "language_loss": 0.87675655, + "learning_rate": 3.372021529749159e-06, + "loss": 0.89882898, + "num_input_tokens_seen": 50222435, + "step": 2342, + "time_per_iteration": 3.409428358078003 + }, + { + "auxiliary_loss_clip": 0.01133496, + "auxiliary_loss_mlp": 0.01033349, + "balance_loss_clip": 1.05624151, + "balance_loss_mlp": 1.02449799, + "epoch": 0.2817290927673901, + "flos": 16834743290880.0, + "grad_norm": 1.856778383664567, + "language_loss": 0.92201102, + "learning_rate": 3.3714546526327405e-06, + "loss": 0.94367945, + "num_input_tokens_seen": 50240435, + "step": 2343, + "time_per_iteration": 3.4450154304504395 + }, + { + "auxiliary_loss_clip": 0.01166647, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.05773675, + "balance_loss_mlp": 1.02154708, + "epoch": 0.2818493356580292, + "flos": 15414081500160.0, + "grad_norm": 1.9954686708102507, + "language_loss": 0.87517333, + "learning_rate": 3.3708875674677423e-06, + "loss": 0.89715403, + "num_input_tokens_seen": 50258410, + "step": 2344, + "time_per_iteration": 3.3381736278533936 + }, + { + "auxiliary_loss_clip": 0.01186885, + "auxiliary_loss_mlp": 0.0103178, + "balance_loss_clip": 1.06462586, + "balance_loss_mlp": 1.02197528, + "epoch": 0.28196957854866833, + "flos": 20412451595520.0, + "grad_norm": 1.9092639907027233, + "language_loss": 0.83694327, + "learning_rate": 3.37032027434019e-06, + "loss": 0.85912991, + "num_input_tokens_seen": 50277930, + "step": 2345, + "time_per_iteration": 2.535391330718994 + }, + { + "auxiliary_loss_clip": 0.01202432, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.06387603, + "balance_loss_mlp": 1.02106702, + "epoch": 0.2820898214393074, + "flos": 19973120348160.0, + "grad_norm": 5.171048350433416, + "language_loss": 0.82612407, + "learning_rate": 3.369752773336141e-06, + "loss": 0.84846711, + "num_input_tokens_seen": 50297410, + "step": 2346, + "time_per_iteration": 2.5055010318756104 + }, + { + "auxiliary_loss_clip": 0.01173886, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.05696893, + "balance_loss_mlp": 1.02402127, + "epoch": 0.2822100643299465, + "flos": 22528308188160.0, + "grad_norm": 1.6579476078028912, + "language_loss": 0.77877158, + "learning_rate": 3.3691850645416864e-06, + "loss": 0.80085343, + "num_input_tokens_seen": 50317120, + "step": 2347, + "time_per_iteration": 2.5489501953125 + }, + { + "auxiliary_loss_clip": 0.01197893, + "auxiliary_loss_mlp": 0.01039699, + "balance_loss_clip": 1.06306696, + "balance_loss_mlp": 1.02986479, + "epoch": 0.2823303072205856, + "flos": 11546682007680.0, + "grad_norm": 1.882846620029653, + "language_loss": 0.8282944, + "learning_rate": 3.368617148042945e-06, + "loss": 0.85067034, + "num_input_tokens_seen": 50334790, + "step": 2348, + "time_per_iteration": 2.505981683731079 + }, + { + "auxiliary_loss_clip": 0.01170172, + "auxiliary_loss_mlp": 0.0103684, + "balance_loss_clip": 1.05508637, + "balance_loss_mlp": 1.0265466, + "epoch": 0.28245055011122466, + "flos": 18259894281600.0, + "grad_norm": 1.8691618871655928, + "language_loss": 0.8425113, + "learning_rate": 3.368049023926071e-06, + "loss": 0.86458147, + "num_input_tokens_seen": 50353785, + "step": 2349, + "time_per_iteration": 2.526679039001465 + }, + { + "auxiliary_loss_clip": 0.011914, + "auxiliary_loss_mlp": 0.0103685, + "balance_loss_clip": 1.06355131, + "balance_loss_mlp": 1.0282495, + "epoch": 0.2825707930018638, + "flos": 24608110504320.0, + "grad_norm": 1.595431489534568, + "language_loss": 0.83579248, + "learning_rate": 3.3674806922772476e-06, + "loss": 0.85807496, + "num_input_tokens_seen": 50374670, + "step": 2350, + "time_per_iteration": 2.624443531036377 + }, + { + "auxiliary_loss_clip": 0.01170053, + "auxiliary_loss_mlp": 0.01040415, + "balance_loss_clip": 1.05922985, + "balance_loss_mlp": 1.03089666, + "epoch": 0.28269103589250283, + "flos": 25226994862080.0, + "grad_norm": 2.136161510632417, + "language_loss": 0.74909902, + "learning_rate": 3.3669121531826904e-06, + "loss": 0.77120376, + "num_input_tokens_seen": 50395650, + "step": 2351, + "time_per_iteration": 2.659956216812134 + }, + { + "auxiliary_loss_clip": 0.01160098, + "auxiliary_loss_mlp": 0.01029089, + "balance_loss_clip": 1.06141984, + "balance_loss_mlp": 1.02030897, + "epoch": 0.28281127878314194, + "flos": 19281552819840.0, + "grad_norm": 2.173373283419757, + "language_loss": 0.83295619, + "learning_rate": 3.366343406728647e-06, + "loss": 0.85484803, + "num_input_tokens_seen": 50415100, + "step": 2352, + "time_per_iteration": 2.5878121852874756 + }, + { + "auxiliary_loss_clip": 0.01183629, + "auxiliary_loss_mlp": 0.01030229, + "balance_loss_clip": 1.05737889, + "balance_loss_mlp": 1.02094913, + "epoch": 0.28293152167378105, + "flos": 23878405710720.0, + "grad_norm": 1.707425677985109, + "language_loss": 0.68638068, + "learning_rate": 3.3657744530013946e-06, + "loss": 0.70851934, + "num_input_tokens_seen": 50434335, + "step": 2353, + "time_per_iteration": 2.5580530166625977 + }, + { + "auxiliary_loss_clip": 0.01195874, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.06339359, + "balance_loss_mlp": 1.02051306, + "epoch": 0.2830517645644201, + "flos": 43866965928960.0, + "grad_norm": 2.0762422535772638, + "language_loss": 0.71071017, + "learning_rate": 3.3652052920872437e-06, + "loss": 0.73297, + "num_input_tokens_seen": 50457200, + "step": 2354, + "time_per_iteration": 2.7134580612182617 + }, + { + "auxiliary_loss_clip": 0.01179273, + "auxiliary_loss_mlp": 0.01037555, + "balance_loss_clip": 1.05931056, + "balance_loss_mlp": 1.02764273, + "epoch": 0.2831720074550592, + "flos": 26651750803200.0, + "grad_norm": 1.8624680207898485, + "language_loss": 0.85815382, + "learning_rate": 3.3646359240725355e-06, + "loss": 0.8803221, + "num_input_tokens_seen": 50476390, + "step": 2355, + "time_per_iteration": 2.614729642868042 + }, + { + "auxiliary_loss_clip": 0.01185177, + "auxiliary_loss_mlp": 0.00764724, + "balance_loss_clip": 1.06031001, + "balance_loss_mlp": 1.00082004, + "epoch": 0.2832922503456983, + "flos": 31029979564800.0, + "grad_norm": 2.694359359149424, + "language_loss": 0.67484367, + "learning_rate": 3.364066349043643e-06, + "loss": 0.69434267, + "num_input_tokens_seen": 50497595, + "step": 2356, + "time_per_iteration": 2.6167221069335938 + }, + { + "auxiliary_loss_clip": 0.01175585, + "auxiliary_loss_mlp": 0.01032289, + "balance_loss_clip": 1.06018806, + "balance_loss_mlp": 1.02372348, + "epoch": 0.2834124932363374, + "flos": 20405699838720.0, + "grad_norm": 1.9004247575826367, + "language_loss": 0.82089382, + "learning_rate": 3.363496567086969e-06, + "loss": 0.84297258, + "num_input_tokens_seen": 50514690, + "step": 2357, + "time_per_iteration": 2.53696870803833 + }, + { + "auxiliary_loss_clip": 0.01206444, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.06446064, + "balance_loss_mlp": 1.02153432, + "epoch": 0.2835327361269765, + "flos": 39384848056320.0, + "grad_norm": 1.9190401425507195, + "language_loss": 0.75401032, + "learning_rate": 3.3629265782889506e-06, + "loss": 0.77638113, + "num_input_tokens_seen": 50536515, + "step": 2358, + "time_per_iteration": 2.6603875160217285 + }, + { + "auxiliary_loss_clip": 0.01157554, + "auxiliary_loss_mlp": 0.01033714, + "balance_loss_clip": 1.055071, + "balance_loss_mlp": 1.02403402, + "epoch": 0.2836529790176156, + "flos": 30261598801920.0, + "grad_norm": 2.042476570994474, + "language_loss": 0.72166002, + "learning_rate": 3.362356382736054e-06, + "loss": 0.74357271, + "num_input_tokens_seen": 50557120, + "step": 2359, + "time_per_iteration": 2.6420087814331055 + }, + { + "auxiliary_loss_clip": 0.0116007, + "auxiliary_loss_mlp": 0.01023801, + "balance_loss_clip": 1.05396056, + "balance_loss_mlp": 1.01566529, + "epoch": 0.28377322190825466, + "flos": 12677796264960.0, + "grad_norm": 1.9568675531477338, + "language_loss": 0.90907407, + "learning_rate": 3.361785980514777e-06, + "loss": 0.93091279, + "num_input_tokens_seen": 50573320, + "step": 2360, + "time_per_iteration": 2.6443564891815186 + }, + { + "auxiliary_loss_clip": 0.01128404, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.05643725, + "balance_loss_mlp": 1.02615535, + "epoch": 0.28389346479889377, + "flos": 18296666830080.0, + "grad_norm": 1.9874025011676186, + "language_loss": 0.76633704, + "learning_rate": 3.361215371711649e-06, + "loss": 0.78797132, + "num_input_tokens_seen": 50592415, + "step": 2361, + "time_per_iteration": 2.6237235069274902 + }, + { + "auxiliary_loss_clip": 0.01155492, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.05587792, + "balance_loss_mlp": 1.02279377, + "epoch": 0.2840137076895329, + "flos": 20406992728320.0, + "grad_norm": 1.9902492788309643, + "language_loss": 0.83466303, + "learning_rate": 3.3606445564132326e-06, + "loss": 0.8565309, + "num_input_tokens_seen": 50609710, + "step": 2362, + "time_per_iteration": 2.636603832244873 + }, + { + "auxiliary_loss_clip": 0.01208778, + "auxiliary_loss_mlp": 0.00764205, + "balance_loss_clip": 1.06684673, + "balance_loss_mlp": 1.00074553, + "epoch": 0.28413395058017193, + "flos": 20048030161920.0, + "grad_norm": 1.960051778617603, + "language_loss": 0.82106245, + "learning_rate": 3.360073534706118e-06, + "loss": 0.8407923, + "num_input_tokens_seen": 50626865, + "step": 2363, + "time_per_iteration": 2.5006539821624756 + }, + { + "auxiliary_loss_clip": 0.0117748, + "auxiliary_loss_mlp": 0.01027523, + "balance_loss_clip": 1.05975437, + "balance_loss_mlp": 1.01862371, + "epoch": 0.28425419347081105, + "flos": 37663613256960.0, + "grad_norm": 2.5619835828188657, + "language_loss": 0.75949162, + "learning_rate": 3.35950230667693e-06, + "loss": 0.78154165, + "num_input_tokens_seen": 50648560, + "step": 2364, + "time_per_iteration": 3.575005292892456 + }, + { + "auxiliary_loss_clip": 0.01193141, + "auxiliary_loss_mlp": 0.01026669, + "balance_loss_clip": 1.06227612, + "balance_loss_mlp": 1.01805031, + "epoch": 0.28437443636145016, + "flos": 13845072539520.0, + "grad_norm": 2.300519316214218, + "language_loss": 0.85227096, + "learning_rate": 3.358930872412323e-06, + "loss": 0.87446904, + "num_input_tokens_seen": 50665725, + "step": 2365, + "time_per_iteration": 2.4773616790771484 + }, + { + "auxiliary_loss_clip": 0.0118766, + "auxiliary_loss_mlp": 0.01033034, + "balance_loss_clip": 1.06158233, + "balance_loss_mlp": 1.02386665, + "epoch": 0.2844946792520892, + "flos": 22747794243840.0, + "grad_norm": 1.7623387457713378, + "language_loss": 0.80856657, + "learning_rate": 3.3583592319989825e-06, + "loss": 0.83077353, + "num_input_tokens_seen": 50685095, + "step": 2366, + "time_per_iteration": 2.582291603088379 + }, + { + "auxiliary_loss_clip": 0.01198795, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.06252277, + "balance_loss_mlp": 1.02849197, + "epoch": 0.2846149221427283, + "flos": 32415987709440.0, + "grad_norm": 2.1967308208184906, + "language_loss": 0.68716174, + "learning_rate": 3.357787385523627e-06, + "loss": 0.70952964, + "num_input_tokens_seen": 50706500, + "step": 2367, + "time_per_iteration": 2.585573196411133 + }, + { + "auxiliary_loss_clip": 0.01141871, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.05591512, + "balance_loss_mlp": 1.02290118, + "epoch": 0.2847351650333674, + "flos": 28475976873600.0, + "grad_norm": 2.025690913299581, + "language_loss": 0.82715809, + "learning_rate": 3.3572153330730048e-06, + "loss": 0.84889078, + "num_input_tokens_seen": 50727595, + "step": 2368, + "time_per_iteration": 2.7367939949035645 + }, + { + "auxiliary_loss_clip": 0.01094788, + "auxiliary_loss_mlp": 0.01001877, + "balance_loss_clip": 1.04459929, + "balance_loss_mlp": 1.00039852, + "epoch": 0.2848554079240065, + "flos": 55753399704960.0, + "grad_norm": 0.8242773002180156, + "language_loss": 0.64665413, + "learning_rate": 3.3566430747338956e-06, + "loss": 0.66762078, + "num_input_tokens_seen": 50782800, + "step": 2369, + "time_per_iteration": 3.959866762161255 + }, + { + "auxiliary_loss_clip": 0.01192984, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.06040013, + "balance_loss_mlp": 1.0227232, + "epoch": 0.2849756508146456, + "flos": 11836875985920.0, + "grad_norm": 4.972296229661133, + "language_loss": 0.86422235, + "learning_rate": 3.35607061059311e-06, + "loss": 0.88647127, + "num_input_tokens_seen": 50797730, + "step": 2370, + "time_per_iteration": 3.390984296798706 + }, + { + "auxiliary_loss_clip": 0.01201395, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.06159592, + "balance_loss_mlp": 1.02454221, + "epoch": 0.28509589370528465, + "flos": 25155209531520.0, + "grad_norm": 1.8411600341994314, + "language_loss": 0.75162351, + "learning_rate": 3.3554979407374917e-06, + "loss": 0.77396846, + "num_input_tokens_seen": 50819840, + "step": 2371, + "time_per_iteration": 2.599081039428711 + }, + { + "auxiliary_loss_clip": 0.01192201, + "auxiliary_loss_mlp": 0.01033268, + "balance_loss_clip": 1.06195831, + "balance_loss_mlp": 1.02445197, + "epoch": 0.28521613659592376, + "flos": 19974808287360.0, + "grad_norm": 2.168944280064147, + "language_loss": 0.73397607, + "learning_rate": 3.3549250652539134e-06, + "loss": 0.75623071, + "num_input_tokens_seen": 50838935, + "step": 2372, + "time_per_iteration": 2.517530679702759 + }, + { + "auxiliary_loss_clip": 0.01175333, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.05793881, + "balance_loss_mlp": 1.02278793, + "epoch": 0.2853363794865629, + "flos": 23367971491200.0, + "grad_norm": 1.8746988481138647, + "language_loss": 0.81731111, + "learning_rate": 3.3543519842292794e-06, + "loss": 0.8393873, + "num_input_tokens_seen": 50858590, + "step": 2373, + "time_per_iteration": 2.567857265472412 + }, + { + "auxiliary_loss_clip": 0.01206605, + "auxiliary_loss_mlp": 0.00763809, + "balance_loss_clip": 1.06479335, + "balance_loss_mlp": 1.0007211, + "epoch": 0.28545662237720193, + "flos": 19861940776320.0, + "grad_norm": 1.744301002779992, + "language_loss": 0.83457518, + "learning_rate": 3.353778697750527e-06, + "loss": 0.85427928, + "num_input_tokens_seen": 50876995, + "step": 2374, + "time_per_iteration": 2.4623165130615234 + }, + { + "auxiliary_loss_clip": 0.011677, + "auxiliary_loss_mlp": 0.01027711, + "balance_loss_clip": 1.05745029, + "balance_loss_mlp": 1.01866889, + "epoch": 0.28557686526784104, + "flos": 23879016241920.0, + "grad_norm": 1.9550237495053546, + "language_loss": 0.89293051, + "learning_rate": 3.353205205904622e-06, + "loss": 0.91488457, + "num_input_tokens_seen": 50896105, + "step": 2375, + "time_per_iteration": 2.658015727996826 + }, + { + "auxiliary_loss_clip": 0.01176211, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.059587, + "balance_loss_mlp": 1.02300668, + "epoch": 0.28569710815848015, + "flos": 44890384233600.0, + "grad_norm": 2.1578884171303163, + "language_loss": 0.72087818, + "learning_rate": 3.3526315087785637e-06, + "loss": 0.74295843, + "num_input_tokens_seen": 50917220, + "step": 2376, + "time_per_iteration": 2.7428653240203857 + }, + { + "auxiliary_loss_clip": 0.01125875, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.05305409, + "balance_loss_mlp": 1.02302682, + "epoch": 0.2858173510491192, + "flos": 26829759628800.0, + "grad_norm": 1.6070539904570438, + "language_loss": 0.80692399, + "learning_rate": 3.3520576064593805e-06, + "loss": 0.82850266, + "num_input_tokens_seen": 50937175, + "step": 2377, + "time_per_iteration": 2.633937358856201 + }, + { + "auxiliary_loss_clip": 0.01194351, + "auxiliary_loss_mlp": 0.01027429, + "balance_loss_clip": 1.06182778, + "balance_loss_mlp": 1.01830363, + "epoch": 0.2859375939397583, + "flos": 23148916398720.0, + "grad_norm": 1.7961619675202265, + "language_loss": 0.81772435, + "learning_rate": 3.3514834990341337e-06, + "loss": 0.8399421, + "num_input_tokens_seen": 50957500, + "step": 2378, + "time_per_iteration": 2.606462240219116 + }, + { + "auxiliary_loss_clip": 0.01184098, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.0625329, + "balance_loss_mlp": 1.0225054, + "epoch": 0.2860578368303974, + "flos": 12129799397760.0, + "grad_norm": 3.7731482053835363, + "language_loss": 0.92979473, + "learning_rate": 3.3509091865899144e-06, + "loss": 0.95194387, + "num_input_tokens_seen": 50972690, + "step": 2379, + "time_per_iteration": 2.6021652221679688 + }, + { + "auxiliary_loss_clip": 0.01205216, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.06268048, + "balance_loss_mlp": 1.02290678, + "epoch": 0.2861780797210365, + "flos": 19938035738880.0, + "grad_norm": 2.329486631853123, + "language_loss": 0.7042048, + "learning_rate": 3.350334669213846e-06, + "loss": 0.72657758, + "num_input_tokens_seen": 50990095, + "step": 2380, + "time_per_iteration": 2.533942937850952 + }, + { + "auxiliary_loss_clip": 0.01187849, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.06122661, + "balance_loss_mlp": 1.02572083, + "epoch": 0.2862983226116756, + "flos": 27563127609600.0, + "grad_norm": 1.9372811296714165, + "language_loss": 0.75630999, + "learning_rate": 3.3497599469930816e-06, + "loss": 0.77852774, + "num_input_tokens_seen": 51008305, + "step": 2381, + "time_per_iteration": 2.552480936050415 + }, + { + "auxiliary_loss_clip": 0.0120487, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.0609355, + "balance_loss_mlp": 1.02201271, + "epoch": 0.28641856550231465, + "flos": 22053964158720.0, + "grad_norm": 2.169885444424467, + "language_loss": 0.83164561, + "learning_rate": 3.349185020014807e-06, + "loss": 0.85400754, + "num_input_tokens_seen": 51025570, + "step": 2382, + "time_per_iteration": 2.498779535293579 + }, + { + "auxiliary_loss_clip": 0.01193425, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.06109154, + "balance_loss_mlp": 1.02130163, + "epoch": 0.28653880839295376, + "flos": 22378775869440.0, + "grad_norm": 2.104845382266116, + "language_loss": 0.74216336, + "learning_rate": 3.348609888366237e-06, + "loss": 0.76439625, + "num_input_tokens_seen": 51044585, + "step": 2383, + "time_per_iteration": 2.539055824279785 + }, + { + "auxiliary_loss_clip": 0.01127379, + "auxiliary_loss_mlp": 0.0102561, + "balance_loss_clip": 1.05329919, + "balance_loss_mlp": 1.01644325, + "epoch": 0.28665905128359287, + "flos": 23367971491200.0, + "grad_norm": 2.32982882217772, + "language_loss": 0.62526208, + "learning_rate": 3.348034552134619e-06, + "loss": 0.64679199, + "num_input_tokens_seen": 51063990, + "step": 2384, + "time_per_iteration": 2.6518490314483643 + }, + { + "auxiliary_loss_clip": 0.01140508, + "auxiliary_loss_mlp": 0.01032827, + "balance_loss_clip": 1.05743515, + "balance_loss_mlp": 1.02445817, + "epoch": 0.2867792941742319, + "flos": 20881695893760.0, + "grad_norm": 1.9825600712093907, + "language_loss": 0.83897316, + "learning_rate": 3.3474590114072316e-06, + "loss": 0.86070657, + "num_input_tokens_seen": 51081990, + "step": 2385, + "time_per_iteration": 2.6022582054138184 + }, + { + "auxiliary_loss_clip": 0.0116075, + "auxiliary_loss_mlp": 0.0103603, + "balance_loss_clip": 1.06148148, + "balance_loss_mlp": 1.02677917, + "epoch": 0.28689953706487104, + "flos": 20664005518080.0, + "grad_norm": 2.4936825176014845, + "language_loss": 0.82716608, + "learning_rate": 3.3468832662713836e-06, + "loss": 0.84913391, + "num_input_tokens_seen": 51100235, + "step": 2386, + "time_per_iteration": 2.5869791507720947 + }, + { + "auxiliary_loss_clip": 0.01155358, + "auxiliary_loss_mlp": 0.01037734, + "balance_loss_clip": 1.05651855, + "balance_loss_mlp": 1.02873945, + "epoch": 0.28701977995551015, + "flos": 12675533708160.0, + "grad_norm": 2.1766105322883322, + "language_loss": 0.83761138, + "learning_rate": 3.346307316814415e-06, + "loss": 0.85954225, + "num_input_tokens_seen": 51115405, + "step": 2387, + "time_per_iteration": 2.56998348236084 + }, + { + "auxiliary_loss_clip": 0.0119017, + "auxiliary_loss_mlp": 0.01030491, + "balance_loss_clip": 1.06199944, + "balance_loss_mlp": 1.02121699, + "epoch": 0.2871400228461492, + "flos": 21252366293760.0, + "grad_norm": 2.1630717103264665, + "language_loss": 0.75839806, + "learning_rate": 3.3457311631236965e-06, + "loss": 0.78060466, + "num_input_tokens_seen": 51136390, + "step": 2388, + "time_per_iteration": 2.5333447456359863 + }, + { + "auxiliary_loss_clip": 0.01163842, + "auxiliary_loss_mlp": 0.01030202, + "balance_loss_clip": 1.05806637, + "balance_loss_mlp": 1.02139854, + "epoch": 0.2872602657367883, + "flos": 25119262995840.0, + "grad_norm": 1.755746825594979, + "language_loss": 0.84420305, + "learning_rate": 3.345154805286631e-06, + "loss": 0.86614358, + "num_input_tokens_seen": 51156650, + "step": 2389, + "time_per_iteration": 2.6632096767425537 + }, + { + "auxiliary_loss_clip": 0.01186223, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.06000936, + "balance_loss_mlp": 1.02463937, + "epoch": 0.2873805086274274, + "flos": 16646606830080.0, + "grad_norm": 2.4479971218267007, + "language_loss": 0.76206875, + "learning_rate": 3.344578243390651e-06, + "loss": 0.78426957, + "num_input_tokens_seen": 51172210, + "step": 2390, + "time_per_iteration": 2.51564884185791 + }, + { + "auxiliary_loss_clip": 0.0117361, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.0606041, + "balance_loss_mlp": 1.02313328, + "epoch": 0.2875007515180665, + "flos": 17420123237760.0, + "grad_norm": 3.1989744779227776, + "language_loss": 0.78559852, + "learning_rate": 3.3440014775232206e-06, + "loss": 0.80766124, + "num_input_tokens_seen": 51190265, + "step": 2391, + "time_per_iteration": 3.3988842964172363 + }, + { + "auxiliary_loss_clip": 0.01164769, + "auxiliary_loss_mlp": 0.01030922, + "balance_loss_clip": 1.05933332, + "balance_loss_mlp": 1.02253604, + "epoch": 0.2876209944087056, + "flos": 23434190213760.0, + "grad_norm": 2.9111860552634603, + "language_loss": 0.70960832, + "learning_rate": 3.343424507771834e-06, + "loss": 0.73156524, + "num_input_tokens_seen": 51208475, + "step": 2392, + "time_per_iteration": 2.6180832386016846 + }, + { + "auxiliary_loss_clip": 0.01160356, + "auxiliary_loss_mlp": 0.01028745, + "balance_loss_clip": 1.05859554, + "balance_loss_mlp": 1.020329, + "epoch": 0.2877412372993447, + "flos": 13735509079680.0, + "grad_norm": 1.7787883190164946, + "language_loss": 0.8678295, + "learning_rate": 3.342847334224018e-06, + "loss": 0.8897205, + "num_input_tokens_seen": 51225875, + "step": 2393, + "time_per_iteration": 2.6140360832214355 + }, + { + "auxiliary_loss_clip": 0.01110196, + "auxiliary_loss_mlp": 0.01006328, + "balance_loss_clip": 1.04217052, + "balance_loss_mlp": 1.0049628, + "epoch": 0.28786148018998375, + "flos": 58079695104000.0, + "grad_norm": 0.9416801896126653, + "language_loss": 0.62427354, + "learning_rate": 3.342269956967329e-06, + "loss": 0.64543879, + "num_input_tokens_seen": 51287780, + "step": 2394, + "time_per_iteration": 3.1465885639190674 + }, + { + "auxiliary_loss_clip": 0.01196602, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.06365514, + "balance_loss_mlp": 1.02614737, + "epoch": 0.28798172308062286, + "flos": 23435052140160.0, + "grad_norm": 2.5055395850598643, + "language_loss": 0.71856058, + "learning_rate": 3.341692376089355e-06, + "loss": 0.74089205, + "num_input_tokens_seen": 51303335, + "step": 2395, + "time_per_iteration": 3.4253551959991455 + }, + { + "auxiliary_loss_clip": 0.01189538, + "auxiliary_loss_mlp": 0.01033825, + "balance_loss_clip": 1.06328869, + "balance_loss_mlp": 1.02490187, + "epoch": 0.288101965971262, + "flos": 25110033200640.0, + "grad_norm": 4.208070991132828, + "language_loss": 0.84362888, + "learning_rate": 3.3411145916777146e-06, + "loss": 0.86586249, + "num_input_tokens_seen": 51317495, + "step": 2396, + "time_per_iteration": 3.314282178878784 + }, + { + "auxiliary_loss_clip": 0.01168151, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.05717874, + "balance_loss_mlp": 1.02188182, + "epoch": 0.28822220886190103, + "flos": 16252559654400.0, + "grad_norm": 7.363153001333871, + "language_loss": 0.90462404, + "learning_rate": 3.3405366038200566e-06, + "loss": 0.92661756, + "num_input_tokens_seen": 51336430, + "step": 2397, + "time_per_iteration": 3.2972218990325928 + }, + { + "auxiliary_loss_clip": 0.01180515, + "auxiliary_loss_mlp": 0.01039798, + "balance_loss_clip": 1.06512213, + "balance_loss_mlp": 1.03036833, + "epoch": 0.28834245175254014, + "flos": 24535642815360.0, + "grad_norm": 2.316113287464621, + "language_loss": 0.8474431, + "learning_rate": 3.3399584126040617e-06, + "loss": 0.86964619, + "num_input_tokens_seen": 51355930, + "step": 2398, + "time_per_iteration": 2.5883123874664307 + }, + { + "auxiliary_loss_clip": 0.01205295, + "auxiliary_loss_mlp": 0.00763223, + "balance_loss_clip": 1.06315017, + "balance_loss_mlp": 1.00055063, + "epoch": 0.2884626946431792, + "flos": 24571445696640.0, + "grad_norm": 1.895950222794461, + "language_loss": 0.90676975, + "learning_rate": 3.339380018117441e-06, + "loss": 0.92645496, + "num_input_tokens_seen": 51376765, + "step": 2399, + "time_per_iteration": 2.533411979675293 + }, + { + "auxiliary_loss_clip": 0.01188791, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.0625298, + "balance_loss_mlp": 1.02110028, + "epoch": 0.2885829375338183, + "flos": 16544657053440.0, + "grad_norm": 2.5877069647403452, + "language_loss": 0.78251559, + "learning_rate": 3.3388014204479366e-06, + "loss": 0.80470169, + "num_input_tokens_seen": 51394570, + "step": 2400, + "time_per_iteration": 2.5092945098876953 + }, + { + "auxiliary_loss_clip": 0.01210963, + "auxiliary_loss_mlp": 0.01029922, + "balance_loss_clip": 1.06550336, + "balance_loss_mlp": 1.02107632, + "epoch": 0.2887031804244574, + "flos": 24061226958720.0, + "grad_norm": 2.6541472970759554, + "language_loss": 0.91612792, + "learning_rate": 3.338222619683321e-06, + "loss": 0.9385367, + "num_input_tokens_seen": 51414535, + "step": 2401, + "time_per_iteration": 2.5581748485565186 + }, + { + "auxiliary_loss_clip": 0.011797, + "auxiliary_loss_mlp": 0.01028987, + "balance_loss_clip": 1.06140471, + "balance_loss_mlp": 1.01964736, + "epoch": 0.2888234233150965, + "flos": 23330696152320.0, + "grad_norm": 2.4906168921043914, + "language_loss": 0.73581588, + "learning_rate": 3.337643615911398e-06, + "loss": 0.75790274, + "num_input_tokens_seen": 51434160, + "step": 2402, + "time_per_iteration": 2.5718328952789307 + }, + { + "auxiliary_loss_clip": 0.01192058, + "auxiliary_loss_mlp": 0.01028843, + "balance_loss_clip": 1.06080925, + "balance_loss_mlp": 1.01890087, + "epoch": 0.2889436662057356, + "flos": 22272767856000.0, + "grad_norm": 2.117520059593653, + "language_loss": 0.7878207, + "learning_rate": 3.3370644092200026e-06, + "loss": 0.81002975, + "num_input_tokens_seen": 51451435, + "step": 2403, + "time_per_iteration": 2.558042526245117 + }, + { + "auxiliary_loss_clip": 0.01145025, + "auxiliary_loss_mlp": 0.01032651, + "balance_loss_clip": 1.05169439, + "balance_loss_mlp": 1.02333522, + "epoch": 0.2890639090963747, + "flos": 21616931381760.0, + "grad_norm": 1.7973179924388445, + "language_loss": 0.78104806, + "learning_rate": 3.3364849996969985e-06, + "loss": 0.80282485, + "num_input_tokens_seen": 51471455, + "step": 2404, + "time_per_iteration": 2.6071979999542236 + }, + { + "auxiliary_loss_clip": 0.01188986, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.06104934, + "balance_loss_mlp": 1.02102888, + "epoch": 0.28918415198701375, + "flos": 28585540333440.0, + "grad_norm": 1.806707488056981, + "language_loss": 0.8532275, + "learning_rate": 3.335905387430283e-06, + "loss": 0.87541926, + "num_input_tokens_seen": 51492890, + "step": 2405, + "time_per_iteration": 2.5849053859710693 + }, + { + "auxiliary_loss_clip": 0.01178508, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.05829358, + "balance_loss_mlp": 1.02399671, + "epoch": 0.28930439487765286, + "flos": 21944688007680.0, + "grad_norm": 1.7859010656047312, + "language_loss": 0.83107346, + "learning_rate": 3.335325572507782e-06, + "loss": 0.85319066, + "num_input_tokens_seen": 51513390, + "step": 2406, + "time_per_iteration": 2.5744731426239014 + }, + { + "auxiliary_loss_clip": 0.01210634, + "auxiliary_loss_mlp": 0.00764099, + "balance_loss_clip": 1.06816113, + "balance_loss_mlp": 1.00064349, + "epoch": 0.28942463776829197, + "flos": 19281911955840.0, + "grad_norm": 1.6883569413838524, + "language_loss": 0.73805565, + "learning_rate": 3.3347455550174537e-06, + "loss": 0.75780296, + "num_input_tokens_seen": 51532730, + "step": 2407, + "time_per_iteration": 2.475522518157959 + }, + { + "auxiliary_loss_clip": 0.01157693, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.05476999, + "balance_loss_mlp": 1.024405, + "epoch": 0.289544880658931, + "flos": 14645700737280.0, + "grad_norm": 2.0141154729951216, + "language_loss": 0.68109119, + "learning_rate": 3.3341653350472864e-06, + "loss": 0.70301205, + "num_input_tokens_seen": 51549560, + "step": 2408, + "time_per_iteration": 2.5330286026000977 + }, + { + "auxiliary_loss_clip": 0.01214259, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.06336641, + "balance_loss_mlp": 1.02283239, + "epoch": 0.28966512354957014, + "flos": 28621881918720.0, + "grad_norm": 2.3235009440326757, + "language_loss": 0.69177634, + "learning_rate": 3.333584912685298e-06, + "loss": 0.71425337, + "num_input_tokens_seen": 51568180, + "step": 2409, + "time_per_iteration": 2.5282223224639893 + }, + { + "auxiliary_loss_clip": 0.01079926, + "auxiliary_loss_mlp": 0.01009002, + "balance_loss_clip": 1.03628111, + "balance_loss_mlp": 1.00772655, + "epoch": 0.28978536644020925, + "flos": 64711784511360.0, + "grad_norm": 0.9681170968111037, + "language_loss": 0.55545115, + "learning_rate": 3.3330042880195385e-06, + "loss": 0.57634044, + "num_input_tokens_seen": 51622530, + "step": 2410, + "time_per_iteration": 3.09198260307312 + }, + { + "auxiliary_loss_clip": 0.01176145, + "auxiliary_loss_mlp": 0.01028157, + "balance_loss_clip": 1.05848563, + "balance_loss_mlp": 1.01870334, + "epoch": 0.2899056093308483, + "flos": 18624638937600.0, + "grad_norm": 1.93755332741465, + "language_loss": 0.78646594, + "learning_rate": 3.3324234611380888e-06, + "loss": 0.80850899, + "num_input_tokens_seen": 51641260, + "step": 2411, + "time_per_iteration": 2.5475380420684814 + }, + { + "auxiliary_loss_clip": 0.01151313, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.05500996, + "balance_loss_mlp": 1.02390361, + "epoch": 0.2900258522214874, + "flos": 22893735202560.0, + "grad_norm": 2.657662894469066, + "language_loss": 0.81773156, + "learning_rate": 3.3318424321290596e-06, + "loss": 0.83957249, + "num_input_tokens_seen": 51660975, + "step": 2412, + "time_per_iteration": 2.63137149810791 + }, + { + "auxiliary_loss_clip": 0.01076642, + "auxiliary_loss_mlp": 0.0100363, + "balance_loss_clip": 1.03442025, + "balance_loss_mlp": 1.00226521, + "epoch": 0.2901460951121265, + "flos": 71106036013440.0, + "grad_norm": 0.8528914580462031, + "language_loss": 0.59982955, + "learning_rate": 3.3312612010805917e-06, + "loss": 0.62063229, + "num_input_tokens_seen": 51720550, + "step": 2413, + "time_per_iteration": 3.1731765270233154 + }, + { + "auxiliary_loss_clip": 0.01162893, + "auxiliary_loss_mlp": 0.01038415, + "balance_loss_clip": 1.05737257, + "balance_loss_mlp": 1.02894998, + "epoch": 0.2902663380027656, + "flos": 32160986081280.0, + "grad_norm": 1.713133232171617, + "language_loss": 0.70072764, + "learning_rate": 3.330679768080858e-06, + "loss": 0.72274065, + "num_input_tokens_seen": 51744435, + "step": 2414, + "time_per_iteration": 2.6292827129364014 + }, + { + "auxiliary_loss_clip": 0.01190576, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.06367862, + "balance_loss_mlp": 1.02460098, + "epoch": 0.2903865808934047, + "flos": 29351658539520.0, + "grad_norm": 2.024128948640041, + "language_loss": 0.83503389, + "learning_rate": 3.3300981332180627e-06, + "loss": 0.8572855, + "num_input_tokens_seen": 51763640, + "step": 2415, + "time_per_iteration": 2.5794622898101807 + }, + { + "auxiliary_loss_clip": 0.01167465, + "auxiliary_loss_mlp": 0.01028051, + "balance_loss_clip": 1.05756402, + "balance_loss_mlp": 1.01902103, + "epoch": 0.29050682378404374, + "flos": 17089026647040.0, + "grad_norm": 2.137085589400161, + "language_loss": 0.80126452, + "learning_rate": 3.3295162965804373e-06, + "loss": 0.82321966, + "num_input_tokens_seen": 51782135, + "step": 2416, + "time_per_iteration": 2.5965566635131836 + }, + { + "auxiliary_loss_clip": 0.01156201, + "auxiliary_loss_mlp": 0.01027398, + "balance_loss_clip": 1.05722833, + "balance_loss_mlp": 1.01815963, + "epoch": 0.29062706667468285, + "flos": 17858233422720.0, + "grad_norm": 2.0182079626067235, + "language_loss": 0.78811514, + "learning_rate": 3.328934258256247e-06, + "loss": 0.80995113, + "num_input_tokens_seen": 51800200, + "step": 2417, + "time_per_iteration": 3.513564348220825 + }, + { + "auxiliary_loss_clip": 0.0119104, + "auxiliary_loss_mlp": 0.01030553, + "balance_loss_clip": 1.05934966, + "balance_loss_mlp": 1.02039635, + "epoch": 0.29074730956532197, + "flos": 24279815174400.0, + "grad_norm": 2.024376027443427, + "language_loss": 0.67472589, + "learning_rate": 3.3283520183337856e-06, + "loss": 0.69694179, + "num_input_tokens_seen": 51819905, + "step": 2418, + "time_per_iteration": 2.5593769550323486 + }, + { + "auxiliary_loss_clip": 0.01172277, + "auxiliary_loss_mlp": 0.01031434, + "balance_loss_clip": 1.05761409, + "balance_loss_mlp": 1.02201653, + "epoch": 0.290867552455961, + "flos": 22340961826560.0, + "grad_norm": 1.5750965800784684, + "language_loss": 0.68862599, + "learning_rate": 3.3277695769013797e-06, + "loss": 0.71066308, + "num_input_tokens_seen": 51839350, + "step": 2419, + "time_per_iteration": 2.5420775413513184 + }, + { + "auxiliary_loss_clip": 0.01194084, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.06259513, + "balance_loss_mlp": 1.02207494, + "epoch": 0.29098779534660013, + "flos": 23186155824000.0, + "grad_norm": 2.1426285722053944, + "language_loss": 0.77466422, + "learning_rate": 3.327186934047385e-06, + "loss": 0.79691982, + "num_input_tokens_seen": 51858045, + "step": 2420, + "time_per_iteration": 2.5408473014831543 + }, + { + "auxiliary_loss_clip": 0.01163673, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.05411804, + "balance_loss_mlp": 1.02470434, + "epoch": 0.29110803823723924, + "flos": 15304194817920.0, + "grad_norm": 1.7916491406087527, + "language_loss": 0.65689802, + "learning_rate": 3.3266040898601877e-06, + "loss": 0.67887473, + "num_input_tokens_seen": 51875880, + "step": 2421, + "time_per_iteration": 2.517857313156128 + }, + { + "auxiliary_loss_clip": 0.01140338, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.05222094, + "balance_loss_mlp": 1.0214932, + "epoch": 0.2912282811278783, + "flos": 22595352923520.0, + "grad_norm": 2.053182994260173, + "language_loss": 0.77941227, + "learning_rate": 3.3260210444282045e-06, + "loss": 0.80112201, + "num_input_tokens_seen": 51893835, + "step": 2422, + "time_per_iteration": 3.5465195178985596 + }, + { + "auxiliary_loss_clip": 0.01186018, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.06199372, + "balance_loss_mlp": 1.02566361, + "epoch": 0.2913485240185174, + "flos": 24497900599680.0, + "grad_norm": 2.136328364839051, + "language_loss": 0.73118502, + "learning_rate": 3.325437797839883e-06, + "loss": 0.75339574, + "num_input_tokens_seen": 51912205, + "step": 2423, + "time_per_iteration": 3.264315128326416 + }, + { + "auxiliary_loss_clip": 0.01204405, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.06104994, + "balance_loss_mlp": 1.0247376, + "epoch": 0.2914687669091565, + "flos": 17931024334080.0, + "grad_norm": 2.105803453504304, + "language_loss": 0.74570858, + "learning_rate": 3.3248543501837015e-06, + "loss": 0.76809347, + "num_input_tokens_seen": 51929410, + "step": 2424, + "time_per_iteration": 2.444683790206909 + }, + { + "auxiliary_loss_clip": 0.0115476, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.05951738, + "balance_loss_mlp": 1.0239495, + "epoch": 0.2915890097997956, + "flos": 22529313768960.0, + "grad_norm": 1.7549213412504703, + "language_loss": 0.77552181, + "learning_rate": 3.3242707015481684e-06, + "loss": 0.79740012, + "num_input_tokens_seen": 51949345, + "step": 2425, + "time_per_iteration": 2.625549077987671 + }, + { + "auxiliary_loss_clip": 0.01171152, + "auxiliary_loss_mlp": 0.01030623, + "balance_loss_clip": 1.05472851, + "balance_loss_mlp": 1.02218342, + "epoch": 0.2917092526904347, + "flos": 13845216193920.0, + "grad_norm": 1.637159052522053, + "language_loss": 0.80268264, + "learning_rate": 3.323686852021823e-06, + "loss": 0.82470036, + "num_input_tokens_seen": 51966855, + "step": 2426, + "time_per_iteration": 2.5675816535949707 + }, + { + "auxiliary_loss_clip": 0.0116416, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.05450225, + "balance_loss_mlp": 1.02187288, + "epoch": 0.2918294955810738, + "flos": 22674859678080.0, + "grad_norm": 2.3121532436044405, + "language_loss": 0.79587591, + "learning_rate": 3.323102801693235e-06, + "loss": 0.8178308, + "num_input_tokens_seen": 51985620, + "step": 2427, + "time_per_iteration": 2.65246319770813 + }, + { + "auxiliary_loss_clip": 0.01182783, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.05762768, + "balance_loss_mlp": 1.02089453, + "epoch": 0.29194973847171285, + "flos": 23438284364160.0, + "grad_norm": 2.4546048710733985, + "language_loss": 0.80223823, + "learning_rate": 3.322518550651003e-06, + "loss": 0.82436955, + "num_input_tokens_seen": 52004930, + "step": 2428, + "time_per_iteration": 2.5681416988372803 + }, + { + "auxiliary_loss_clip": 0.01180822, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.05698776, + "balance_loss_mlp": 1.02663839, + "epoch": 0.29206998136235196, + "flos": 21909064694400.0, + "grad_norm": 2.137436896155721, + "language_loss": 0.81402034, + "learning_rate": 3.3219340989837586e-06, + "loss": 0.83618689, + "num_input_tokens_seen": 52024920, + "step": 2429, + "time_per_iteration": 2.661130666732788 + }, + { + "auxiliary_loss_clip": 0.01174282, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.05909109, + "balance_loss_mlp": 1.02479947, + "epoch": 0.292190224252991, + "flos": 23215925220480.0, + "grad_norm": 1.9097660838340935, + "language_loss": 0.80582821, + "learning_rate": 3.3213494467801625e-06, + "loss": 0.82790542, + "num_input_tokens_seen": 52044095, + "step": 2430, + "time_per_iteration": 2.6785995960235596 + }, + { + "auxiliary_loss_clip": 0.01113519, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.04592931, + "balance_loss_mlp": 1.02052474, + "epoch": 0.2923104671436301, + "flos": 20740818752640.0, + "grad_norm": 1.8227797343507777, + "language_loss": 0.71225917, + "learning_rate": 3.3207645941289063e-06, + "loss": 0.73369032, + "num_input_tokens_seen": 52062440, + "step": 2431, + "time_per_iteration": 2.6438469886779785 + }, + { + "auxiliary_loss_clip": 0.01191784, + "auxiliary_loss_mlp": 0.00763923, + "balance_loss_clip": 1.06377077, + "balance_loss_mlp": 1.0002867, + "epoch": 0.29243071003426924, + "flos": 35809114999680.0, + "grad_norm": 1.7465996955748733, + "language_loss": 0.80065852, + "learning_rate": 3.320179541118711e-06, + "loss": 0.82021558, + "num_input_tokens_seen": 52084940, + "step": 2432, + "time_per_iteration": 2.6420559883117676 + }, + { + "auxiliary_loss_clip": 0.01100604, + "auxiliary_loss_mlp": 0.01003887, + "balance_loss_clip": 1.03223062, + "balance_loss_mlp": 1.00246227, + "epoch": 0.2925509529249083, + "flos": 58081598524800.0, + "grad_norm": 1.0249214197562737, + "language_loss": 0.60312903, + "learning_rate": 3.3195942878383293e-06, + "loss": 0.62417388, + "num_input_tokens_seen": 52141040, + "step": 2433, + "time_per_iteration": 3.103043556213379 + }, + { + "auxiliary_loss_clip": 0.01193019, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.06189704, + "balance_loss_mlp": 1.02141118, + "epoch": 0.2926711958155474, + "flos": 21397122103680.0, + "grad_norm": 1.9670036998358922, + "language_loss": 0.77871996, + "learning_rate": 3.319008834376543e-06, + "loss": 0.80096328, + "num_input_tokens_seen": 52160730, + "step": 2434, + "time_per_iteration": 2.486548662185669 + }, + { + "auxiliary_loss_clip": 0.01164531, + "auxiliary_loss_mlp": 0.01025148, + "balance_loss_clip": 1.05157483, + "balance_loss_mlp": 1.01607037, + "epoch": 0.2927914387061865, + "flos": 23185796688000.0, + "grad_norm": 2.0954825512035, + "language_loss": 0.88984632, + "learning_rate": 3.3184231808221654e-06, + "loss": 0.91174316, + "num_input_tokens_seen": 52175055, + "step": 2435, + "time_per_iteration": 2.5504398345947266 + }, + { + "auxiliary_loss_clip": 0.01164275, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.06017447, + "balance_loss_mlp": 1.0284369, + "epoch": 0.29291168159682557, + "flos": 22455553190400.0, + "grad_norm": 1.8407655570802917, + "language_loss": 0.62755907, + "learning_rate": 3.3178373272640394e-06, + "loss": 0.64958179, + "num_input_tokens_seen": 52194150, + "step": 2436, + "time_per_iteration": 2.5636203289031982 + }, + { + "auxiliary_loss_clip": 0.01204314, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.06290472, + "balance_loss_mlp": 1.02752304, + "epoch": 0.2930319244874647, + "flos": 21170632896000.0, + "grad_norm": 2.177517658942623, + "language_loss": 0.85170913, + "learning_rate": 3.3172512737910387e-06, + "loss": 0.87411386, + "num_input_tokens_seen": 52211660, + "step": 2437, + "time_per_iteration": 2.451910972595215 + }, + { + "auxiliary_loss_clip": 0.01190476, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.05854487, + "balance_loss_mlp": 1.0245676, + "epoch": 0.2931521673781038, + "flos": 31357843931520.0, + "grad_norm": 2.8979296588964254, + "language_loss": 0.87840325, + "learning_rate": 3.3166650204920674e-06, + "loss": 0.90064681, + "num_input_tokens_seen": 52232830, + "step": 2438, + "time_per_iteration": 2.5606577396392822 + }, + { + "auxiliary_loss_clip": 0.0119167, + "auxiliary_loss_mlp": 0.01033876, + "balance_loss_clip": 1.06327248, + "balance_loss_mlp": 1.02433634, + "epoch": 0.29327241026874284, + "flos": 24200990778240.0, + "grad_norm": 1.5372924706964617, + "language_loss": 0.81654668, + "learning_rate": 3.316078567456059e-06, + "loss": 0.83880216, + "num_input_tokens_seen": 52250670, + "step": 2439, + "time_per_iteration": 2.5211644172668457 + }, + { + "auxiliary_loss_clip": 0.01136472, + "auxiliary_loss_mlp": 0.01028248, + "balance_loss_clip": 1.05421662, + "balance_loss_mlp": 1.02004027, + "epoch": 0.29339265315938196, + "flos": 24242611662720.0, + "grad_norm": 2.1372627892072105, + "language_loss": 0.75933146, + "learning_rate": 3.3154919147719786e-06, + "loss": 0.78097868, + "num_input_tokens_seen": 52271685, + "step": 2440, + "time_per_iteration": 2.6627840995788574 + }, + { + "auxiliary_loss_clip": 0.01190477, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.06052673, + "balance_loss_mlp": 1.02110076, + "epoch": 0.29351289605002107, + "flos": 16946641134720.0, + "grad_norm": 2.1196209954178333, + "language_loss": 0.86603832, + "learning_rate": 3.31490506252882e-06, + "loss": 0.88824677, + "num_input_tokens_seen": 52291065, + "step": 2441, + "time_per_iteration": 2.5005228519439697 + }, + { + "auxiliary_loss_clip": 0.01147855, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.05032372, + "balance_loss_mlp": 1.02397394, + "epoch": 0.2936331389406601, + "flos": 19829082810240.0, + "grad_norm": 1.8255698780371186, + "language_loss": 0.84314984, + "learning_rate": 3.31431801081561e-06, + "loss": 0.86495072, + "num_input_tokens_seen": 52310000, + "step": 2442, + "time_per_iteration": 2.536606788635254 + }, + { + "auxiliary_loss_clip": 0.01087172, + "auxiliary_loss_mlp": 0.01011709, + "balance_loss_clip": 1.03608942, + "balance_loss_mlp": 1.0104872, + "epoch": 0.29375338183129923, + "flos": 71416844398080.0, + "grad_norm": 0.8978043828204144, + "language_loss": 0.67893684, + "learning_rate": 3.313730759721402e-06, + "loss": 0.69992566, + "num_input_tokens_seen": 52372930, + "step": 2443, + "time_per_iteration": 3.2036445140838623 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.05867529, + "balance_loss_mlp": 1.02524877, + "epoch": 0.29387362472193834, + "flos": 22054502862720.0, + "grad_norm": 1.9646828932350573, + "language_loss": 0.86361015, + "learning_rate": 3.313143309335282e-06, + "loss": 0.88568711, + "num_input_tokens_seen": 52391420, + "step": 2444, + "time_per_iteration": 3.3804564476013184 + }, + { + "auxiliary_loss_clip": 0.01158347, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.05759907, + "balance_loss_mlp": 1.02468514, + "epoch": 0.2939938676125774, + "flos": 22966418373120.0, + "grad_norm": 1.9760051808641652, + "language_loss": 0.84948897, + "learning_rate": 3.3125556597463665e-06, + "loss": 0.87141418, + "num_input_tokens_seen": 52410725, + "step": 2445, + "time_per_iteration": 2.570573568344116 + }, + { + "auxiliary_loss_clip": 0.01187592, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.06216717, + "balance_loss_mlp": 1.02493179, + "epoch": 0.2941141105032165, + "flos": 31358705857920.0, + "grad_norm": 1.567619141861922, + "language_loss": 0.66339487, + "learning_rate": 3.311967811043801e-06, + "loss": 0.68560839, + "num_input_tokens_seen": 52432645, + "step": 2446, + "time_per_iteration": 2.623185873031616 + }, + { + "auxiliary_loss_clip": 0.01190521, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.06228948, + "balance_loss_mlp": 1.0221504, + "epoch": 0.29423435339385556, + "flos": 23222138273280.0, + "grad_norm": 2.5557226140536344, + "language_loss": 0.82255226, + "learning_rate": 3.3113797633167617e-06, + "loss": 0.84477198, + "num_input_tokens_seen": 52450940, + "step": 2447, + "time_per_iteration": 2.616151809692383 + }, + { + "auxiliary_loss_clip": 0.01203828, + "auxiliary_loss_mlp": 0.01029877, + "balance_loss_clip": 1.06127059, + "balance_loss_mlp": 1.02025044, + "epoch": 0.2943545962844947, + "flos": 26864054138880.0, + "grad_norm": 2.116558527049001, + "language_loss": 0.68866307, + "learning_rate": 3.310791516654455e-06, + "loss": 0.71100014, + "num_input_tokens_seen": 52468000, + "step": 2448, + "time_per_iteration": 3.415570020675659 + }, + { + "auxiliary_loss_clip": 0.01169073, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_clip": 1.05729854, + "balance_loss_mlp": 1.03240764, + "epoch": 0.2944748391751338, + "flos": 20231677422720.0, + "grad_norm": 1.825830585048486, + "language_loss": 0.79675436, + "learning_rate": 3.3102030711461177e-06, + "loss": 0.81887221, + "num_input_tokens_seen": 52487575, + "step": 2449, + "time_per_iteration": 4.269005537033081 + }, + { + "auxiliary_loss_clip": 0.01164726, + "auxiliary_loss_mlp": 0.01024555, + "balance_loss_clip": 1.05778742, + "balance_loss_mlp": 1.01538229, + "epoch": 0.29459508206577284, + "flos": 15960965045760.0, + "grad_norm": 1.8538415440303369, + "language_loss": 0.67971587, + "learning_rate": 3.3096144268810156e-06, + "loss": 0.70160866, + "num_input_tokens_seen": 52506335, + "step": 2450, + "time_per_iteration": 2.6147491931915283 + }, + { + "auxiliary_loss_clip": 0.01179106, + "auxiliary_loss_mlp": 0.01031909, + "balance_loss_clip": 1.05796123, + "balance_loss_mlp": 1.02250957, + "epoch": 0.29471532495641195, + "flos": 20412882558720.0, + "grad_norm": 1.9659645250214002, + "language_loss": 0.7295301, + "learning_rate": 3.3090255839484462e-06, + "loss": 0.75164026, + "num_input_tokens_seen": 52524330, + "step": 2451, + "time_per_iteration": 2.6420652866363525 + }, + { + "auxiliary_loss_clip": 0.01176882, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.05749226, + "balance_loss_mlp": 1.02075231, + "epoch": 0.29483556784705106, + "flos": 20376576887040.0, + "grad_norm": 1.925991110262362, + "language_loss": 0.85528541, + "learning_rate": 3.3084365424377366e-06, + "loss": 0.8773582, + "num_input_tokens_seen": 52543095, + "step": 2452, + "time_per_iteration": 2.5142624378204346 + }, + { + "auxiliary_loss_clip": 0.01073416, + "auxiliary_loss_mlp": 0.01008848, + "balance_loss_clip": 1.04353976, + "balance_loss_mlp": 1.00715494, + "epoch": 0.2949558107376901, + "flos": 68555660595840.0, + "grad_norm": 0.72802823671306, + "language_loss": 0.5594064, + "learning_rate": 3.307847302438245e-06, + "loss": 0.58022904, + "num_input_tokens_seen": 52597075, + "step": 2453, + "time_per_iteration": 3.0465087890625 + }, + { + "auxiliary_loss_clip": 0.01128962, + "auxiliary_loss_mlp": 0.01029565, + "balance_loss_clip": 1.04890275, + "balance_loss_mlp": 1.01953936, + "epoch": 0.2950760536283292, + "flos": 16107085572480.0, + "grad_norm": 2.246497937726557, + "language_loss": 0.78037846, + "learning_rate": 3.3072578640393562e-06, + "loss": 0.80196381, + "num_input_tokens_seen": 52614410, + "step": 2454, + "time_per_iteration": 2.612377405166626 + }, + { + "auxiliary_loss_clip": 0.01174708, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.05923843, + "balance_loss_mlp": 1.02053535, + "epoch": 0.29519629651896834, + "flos": 20483626394880.0, + "grad_norm": 1.8481795362133178, + "language_loss": 0.79444516, + "learning_rate": 3.3066682273304886e-06, + "loss": 0.81649226, + "num_input_tokens_seen": 52632055, + "step": 2455, + "time_per_iteration": 2.5631654262542725 + }, + { + "auxiliary_loss_clip": 0.01195319, + "auxiliary_loss_mlp": 0.0076407, + "balance_loss_clip": 1.06213617, + "balance_loss_mlp": 1.00032496, + "epoch": 0.2953165394096074, + "flos": 18916484941440.0, + "grad_norm": 1.9910736544904315, + "language_loss": 0.78806609, + "learning_rate": 3.3060783924010904e-06, + "loss": 0.80765998, + "num_input_tokens_seen": 52649980, + "step": 2456, + "time_per_iteration": 2.5193769931793213 + }, + { + "auxiliary_loss_clip": 0.01163171, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.05919313, + "balance_loss_mlp": 1.02501416, + "epoch": 0.2954367823002465, + "flos": 20624467622400.0, + "grad_norm": 2.471307453115101, + "language_loss": 0.85131407, + "learning_rate": 3.3054883593406387e-06, + "loss": 0.8732937, + "num_input_tokens_seen": 52664730, + "step": 2457, + "time_per_iteration": 2.5412800312042236 + }, + { + "auxiliary_loss_clip": 0.01176856, + "auxiliary_loss_mlp": 0.01034044, + "balance_loss_clip": 1.05884945, + "balance_loss_mlp": 1.02504992, + "epoch": 0.2955570251908856, + "flos": 31175525473920.0, + "grad_norm": 2.1421960615479567, + "language_loss": 0.65481204, + "learning_rate": 3.3048981282386404e-06, + "loss": 0.67692101, + "num_input_tokens_seen": 52686040, + "step": 2458, + "time_per_iteration": 2.6395134925842285 + }, + { + "auxiliary_loss_clip": 0.0114987, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.05724669, + "balance_loss_mlp": 1.02119303, + "epoch": 0.29567726808152467, + "flos": 21650328051840.0, + "grad_norm": 1.934980542574379, + "language_loss": 0.82540143, + "learning_rate": 3.304307699184634e-06, + "loss": 0.84719801, + "num_input_tokens_seen": 52704630, + "step": 2459, + "time_per_iteration": 2.562892436981201 + }, + { + "auxiliary_loss_clip": 0.01175545, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.06029212, + "balance_loss_mlp": 1.02565408, + "epoch": 0.2957975109721638, + "flos": 24243868638720.0, + "grad_norm": 1.7621834198712325, + "language_loss": 0.78859651, + "learning_rate": 3.3037170722681866e-06, + "loss": 0.81069744, + "num_input_tokens_seen": 52725465, + "step": 2460, + "time_per_iteration": 2.54333758354187 + }, + { + "auxiliary_loss_clip": 0.01156518, + "auxiliary_loss_mlp": 0.01032624, + "balance_loss_clip": 1.0592941, + "balance_loss_mlp": 1.02322483, + "epoch": 0.29591775386280283, + "flos": 13479717352320.0, + "grad_norm": 1.761281639389518, + "language_loss": 0.68101501, + "learning_rate": 3.3031262475788956e-06, + "loss": 0.70290643, + "num_input_tokens_seen": 52742405, + "step": 2461, + "time_per_iteration": 2.526418447494507 + }, + { + "auxiliary_loss_clip": 0.01173169, + "auxiliary_loss_mlp": 0.01032337, + "balance_loss_clip": 1.05831647, + "balance_loss_mlp": 1.02332461, + "epoch": 0.29603799675344195, + "flos": 17749783284480.0, + "grad_norm": 1.7226588489405246, + "language_loss": 0.73160857, + "learning_rate": 3.3025352252063897e-06, + "loss": 0.7536636, + "num_input_tokens_seen": 52761100, + "step": 2462, + "time_per_iteration": 2.5071654319763184 + }, + { + "auxiliary_loss_clip": 0.0118954, + "auxiliary_loss_mlp": 0.01038403, + "balance_loss_clip": 1.06385076, + "balance_loss_mlp": 1.02912831, + "epoch": 0.29615823964408106, + "flos": 22783920347520.0, + "grad_norm": 1.9316008937784697, + "language_loss": 0.7524932, + "learning_rate": 3.3019440052403252e-06, + "loss": 0.77477264, + "num_input_tokens_seen": 52780965, + "step": 2463, + "time_per_iteration": 2.5210492610931396 + }, + { + "auxiliary_loss_clip": 0.01176218, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.05787027, + "balance_loss_mlp": 1.021909, + "epoch": 0.2962784825347201, + "flos": 23514199758720.0, + "grad_norm": 1.9874792727442891, + "language_loss": 0.70831293, + "learning_rate": 3.30135258777039e-06, + "loss": 0.73038483, + "num_input_tokens_seen": 52800335, + "step": 2464, + "time_per_iteration": 2.548659086227417 + }, + { + "auxiliary_loss_clip": 0.01194284, + "auxiliary_loss_mlp": 0.00764029, + "balance_loss_clip": 1.06067729, + "balance_loss_mlp": 1.00035286, + "epoch": 0.2963987254253592, + "flos": 16362769559040.0, + "grad_norm": 2.0135302193605282, + "language_loss": 0.70604622, + "learning_rate": 3.3007609728863024e-06, + "loss": 0.72562933, + "num_input_tokens_seen": 52818425, + "step": 2465, + "time_per_iteration": 2.4830453395843506 + }, + { + "auxiliary_loss_clip": 0.01124545, + "auxiliary_loss_mlp": 0.01029915, + "balance_loss_clip": 1.05726361, + "balance_loss_mlp": 1.02039647, + "epoch": 0.29651896831599833, + "flos": 33472263980160.0, + "grad_norm": 2.0939213696398236, + "language_loss": 0.7299872, + "learning_rate": 3.300169160677809e-06, + "loss": 0.75153184, + "num_input_tokens_seen": 52842340, + "step": 2466, + "time_per_iteration": 2.7332284450531006 + }, + { + "auxiliary_loss_clip": 0.01168318, + "auxiliary_loss_mlp": 0.01027985, + "balance_loss_clip": 1.05975151, + "balance_loss_mlp": 1.018466, + "epoch": 0.2966392112066374, + "flos": 23805363404160.0, + "grad_norm": 2.252350045765582, + "language_loss": 0.7770915, + "learning_rate": 3.2995771512346878e-06, + "loss": 0.7990545, + "num_input_tokens_seen": 52860690, + "step": 2467, + "time_per_iteration": 2.5818231105804443 + }, + { + "auxiliary_loss_clip": 0.01208669, + "auxiliary_loss_mlp": 0.00764232, + "balance_loss_clip": 1.06461656, + "balance_loss_mlp": 1.00033641, + "epoch": 0.2967594540972765, + "flos": 19938466702080.0, + "grad_norm": 9.878362557482154, + "language_loss": 0.73227632, + "learning_rate": 3.298984944646746e-06, + "loss": 0.7520054, + "num_input_tokens_seen": 52879370, + "step": 2468, + "time_per_iteration": 2.4837563037872314 + }, + { + "auxiliary_loss_clip": 0.01193126, + "auxiliary_loss_mlp": 0.0076337, + "balance_loss_clip": 1.0625639, + "balance_loss_mlp": 1.00027835, + "epoch": 0.2968796969879156, + "flos": 23732823888000.0, + "grad_norm": 1.8758742520349645, + "language_loss": 0.81659961, + "learning_rate": 3.298392541003822e-06, + "loss": 0.83616459, + "num_input_tokens_seen": 52898775, + "step": 2469, + "time_per_iteration": 2.5382080078125 + }, + { + "auxiliary_loss_clip": 0.01174975, + "auxiliary_loss_mlp": 0.0103081, + "balance_loss_clip": 1.06108212, + "balance_loss_mlp": 1.02200603, + "epoch": 0.29699993987855466, + "flos": 22893699288960.0, + "grad_norm": 1.6177336234424458, + "language_loss": 0.89689589, + "learning_rate": 3.2977999403957806e-06, + "loss": 0.91895366, + "num_input_tokens_seen": 52917535, + "step": 2470, + "time_per_iteration": 3.4227635860443115 + }, + { + "auxiliary_loss_clip": 0.01206573, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.06557202, + "balance_loss_mlp": 1.02745628, + "epoch": 0.2971201827691938, + "flos": 33832555349760.0, + "grad_norm": 2.9443293596005033, + "language_loss": 0.66731179, + "learning_rate": 3.2972071429125207e-06, + "loss": 0.68974924, + "num_input_tokens_seen": 52938755, + "step": 2471, + "time_per_iteration": 2.5778756141662598 + }, + { + "auxiliary_loss_clip": 0.01155426, + "auxiliary_loss_mlp": 0.01031614, + "balance_loss_clip": 1.05770791, + "balance_loss_mlp": 1.02232766, + "epoch": 0.2972404256598329, + "flos": 22054359208320.0, + "grad_norm": 1.9541774829106588, + "language_loss": 0.8816393, + "learning_rate": 3.2966141486439682e-06, + "loss": 0.90350974, + "num_input_tokens_seen": 52957945, + "step": 2472, + "time_per_iteration": 2.54937744140625 + }, + { + "auxiliary_loss_clip": 0.01134126, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.05253124, + "balance_loss_mlp": 1.02101469, + "epoch": 0.29736066855047194, + "flos": 31978595796480.0, + "grad_norm": 2.4185275375393225, + "language_loss": 0.64247286, + "learning_rate": 3.29602095768008e-06, + "loss": 0.66411865, + "num_input_tokens_seen": 52978460, + "step": 2473, + "time_per_iteration": 2.736271381378174 + }, + { + "auxiliary_loss_clip": 0.01169971, + "auxiliary_loss_mlp": 0.0103006, + "balance_loss_clip": 1.0612036, + "balance_loss_mlp": 1.02129173, + "epoch": 0.29748091144111105, + "flos": 33510401245440.0, + "grad_norm": 1.8397306232531603, + "language_loss": 0.63817883, + "learning_rate": 3.2954275701108437e-06, + "loss": 0.66017914, + "num_input_tokens_seen": 52999640, + "step": 2474, + "time_per_iteration": 2.6391453742980957 + }, + { + "auxiliary_loss_clip": 0.01142061, + "auxiliary_loss_mlp": 0.01026019, + "balance_loss_clip": 1.05406857, + "balance_loss_mlp": 1.01678634, + "epoch": 0.29760115433175016, + "flos": 41283373409280.0, + "grad_norm": 2.338121174326109, + "language_loss": 0.68623805, + "learning_rate": 3.294833986026275e-06, + "loss": 0.70791882, + "num_input_tokens_seen": 53022880, + "step": 2475, + "time_per_iteration": 3.6481127738952637 + }, + { + "auxiliary_loss_clip": 0.01153067, + "auxiliary_loss_mlp": 0.01025789, + "balance_loss_clip": 1.05640996, + "balance_loss_mlp": 1.01688433, + "epoch": 0.2977213972223892, + "flos": 24493339572480.0, + "grad_norm": 2.0947412880151806, + "language_loss": 0.85193896, + "learning_rate": 3.29424020551642e-06, + "loss": 0.87372756, + "num_input_tokens_seen": 53041515, + "step": 2476, + "time_per_iteration": 4.150704622268677 + }, + { + "auxiliary_loss_clip": 0.01210018, + "auxiliary_loss_mlp": 0.01035184, + "balance_loss_clip": 1.06459832, + "balance_loss_mlp": 1.02477074, + "epoch": 0.2978416401130283, + "flos": 21285116519040.0, + "grad_norm": 1.978533878096831, + "language_loss": 0.72296906, + "learning_rate": 3.2936462286713546e-06, + "loss": 0.74542105, + "num_input_tokens_seen": 53059865, + "step": 2477, + "time_per_iteration": 2.480623245239258 + }, + { + "auxiliary_loss_clip": 0.01191504, + "auxiliary_loss_mlp": 0.00764295, + "balance_loss_clip": 1.06174481, + "balance_loss_mlp": 1.00030088, + "epoch": 0.2979618830036674, + "flos": 25772154554880.0, + "grad_norm": 1.996844815724552, + "language_loss": 0.77014327, + "learning_rate": 3.2930520555811846e-06, + "loss": 0.78970122, + "num_input_tokens_seen": 53079490, + "step": 2478, + "time_per_iteration": 2.5560431480407715 + }, + { + "auxiliary_loss_clip": 0.01095932, + "auxiliary_loss_mlp": 0.01033001, + "balance_loss_clip": 1.04948032, + "balance_loss_mlp": 1.02344012, + "epoch": 0.2980821258943065, + "flos": 23476996247040.0, + "grad_norm": 1.6958406021904748, + "language_loss": 0.80130553, + "learning_rate": 3.292457686336046e-06, + "loss": 0.82259488, + "num_input_tokens_seen": 53098810, + "step": 2479, + "time_per_iteration": 2.7030694484710693 + }, + { + "auxiliary_loss_clip": 0.01091854, + "auxiliary_loss_mlp": 0.01009119, + "balance_loss_clip": 1.03518629, + "balance_loss_mlp": 1.00764132, + "epoch": 0.2982023687849456, + "flos": 69752314195200.0, + "grad_norm": 0.874127644883929, + "language_loss": 0.61287028, + "learning_rate": 3.291863121026105e-06, + "loss": 0.63388002, + "num_input_tokens_seen": 53162590, + "step": 2480, + "time_per_iteration": 3.1950089931488037 + }, + { + "auxiliary_loss_clip": 0.01191739, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.06191647, + "balance_loss_mlp": 1.02424145, + "epoch": 0.29832261167558466, + "flos": 29825930741760.0, + "grad_norm": 1.8341941873830745, + "language_loss": 0.76604187, + "learning_rate": 3.2912683597415547e-06, + "loss": 0.78829145, + "num_input_tokens_seen": 53186675, + "step": 2481, + "time_per_iteration": 2.6439967155456543 + }, + { + "auxiliary_loss_clip": 0.011644, + "auxiliary_loss_mlp": 0.01035994, + "balance_loss_clip": 1.05876565, + "balance_loss_mlp": 1.02667737, + "epoch": 0.29844285456622377, + "flos": 33910158683520.0, + "grad_norm": 2.051309937754646, + "language_loss": 0.78282851, + "learning_rate": 3.2906734025726213e-06, + "loss": 0.80483246, + "num_input_tokens_seen": 53205940, + "step": 2482, + "time_per_iteration": 2.6849558353424072 + }, + { + "auxiliary_loss_clip": 0.01198728, + "auxiliary_loss_mlp": 0.01039036, + "balance_loss_clip": 1.06457233, + "balance_loss_mlp": 1.02949929, + "epoch": 0.2985630974568629, + "flos": 23876933253120.0, + "grad_norm": 1.7798961196721699, + "language_loss": 0.88462925, + "learning_rate": 3.290078249609559e-06, + "loss": 0.90700692, + "num_input_tokens_seen": 53225360, + "step": 2483, + "time_per_iteration": 2.5498769283294678 + }, + { + "auxiliary_loss_clip": 0.01188429, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.06426525, + "balance_loss_mlp": 1.02515769, + "epoch": 0.29868334034750194, + "flos": 21799106184960.0, + "grad_norm": 1.8362350643667995, + "language_loss": 0.87950534, + "learning_rate": 3.2894829009426514e-06, + "loss": 0.9017328, + "num_input_tokens_seen": 53243195, + "step": 2484, + "time_per_iteration": 2.5167906284332275 + }, + { + "auxiliary_loss_clip": 0.01188402, + "auxiliary_loss_mlp": 0.01033614, + "balance_loss_clip": 1.06123793, + "balance_loss_mlp": 1.02445829, + "epoch": 0.29880358323814105, + "flos": 25666649331840.0, + "grad_norm": 1.8898556178261638, + "language_loss": 0.77748764, + "learning_rate": 3.288887356662213e-06, + "loss": 0.79970771, + "num_input_tokens_seen": 53264530, + "step": 2485, + "time_per_iteration": 2.5755698680877686 + }, + { + "auxiliary_loss_clip": 0.01094887, + "auxiliary_loss_mlp": 0.01006101, + "balance_loss_clip": 1.03364277, + "balance_loss_mlp": 1.00483787, + "epoch": 0.29892382612878016, + "flos": 71005846003200.0, + "grad_norm": 0.7694142112376068, + "language_loss": 0.59711593, + "learning_rate": 3.288291616858588e-06, + "loss": 0.6181258, + "num_input_tokens_seen": 53319920, + "step": 2486, + "time_per_iteration": 2.9750685691833496 + }, + { + "auxiliary_loss_clip": 0.01141358, + "auxiliary_loss_mlp": 0.01027051, + "balance_loss_clip": 1.05825543, + "balance_loss_mlp": 1.01824117, + "epoch": 0.2990440690194192, + "flos": 25481134563840.0, + "grad_norm": 1.6290586183661782, + "language_loss": 0.7689122, + "learning_rate": 3.287695681622149e-06, + "loss": 0.79059625, + "num_input_tokens_seen": 53339270, + "step": 2487, + "time_per_iteration": 2.6511435508728027 + }, + { + "auxiliary_loss_clip": 0.01179766, + "auxiliary_loss_mlp": 0.01027272, + "balance_loss_clip": 1.05895138, + "balance_loss_mlp": 1.01882613, + "epoch": 0.2991643119100583, + "flos": 23732357011200.0, + "grad_norm": 1.9882470204192673, + "language_loss": 0.81252182, + "learning_rate": 3.2870995510432982e-06, + "loss": 0.83459222, + "num_input_tokens_seen": 53357750, + "step": 2488, + "time_per_iteration": 2.5806496143341064 + }, + { + "auxiliary_loss_clip": 0.01182326, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.05971909, + "balance_loss_mlp": 1.02672434, + "epoch": 0.29928455480069743, + "flos": 27417545786880.0, + "grad_norm": 1.7434606052828534, + "language_loss": 0.76765078, + "learning_rate": 3.2865032252124697e-06, + "loss": 0.78982556, + "num_input_tokens_seen": 53378265, + "step": 2489, + "time_per_iteration": 2.5856635570526123 + }, + { + "auxiliary_loss_clip": 0.01174397, + "auxiliary_loss_mlp": 0.01035883, + "balance_loss_clip": 1.0585072, + "balance_loss_mlp": 1.02685928, + "epoch": 0.2994047976913365, + "flos": 33692935184640.0, + "grad_norm": 1.3682145206630378, + "language_loss": 0.77613735, + "learning_rate": 3.2859067042201243e-06, + "loss": 0.79824018, + "num_input_tokens_seen": 53400305, + "step": 2490, + "time_per_iteration": 2.6538589000701904 + }, + { + "auxiliary_loss_clip": 0.01110008, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.05195951, + "balance_loss_mlp": 1.02441239, + "epoch": 0.2995250405819756, + "flos": 16763963541120.0, + "grad_norm": 1.9833230117530085, + "language_loss": 0.77764446, + "learning_rate": 3.2853099881567544e-06, + "loss": 0.79907817, + "num_input_tokens_seen": 53418705, + "step": 2491, + "time_per_iteration": 2.609543561935425 + }, + { + "auxiliary_loss_clip": 0.01202512, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.06415343, + "balance_loss_mlp": 1.0265789, + "epoch": 0.29964528347261465, + "flos": 22963976248320.0, + "grad_norm": 2.097809545643159, + "language_loss": 0.79014528, + "learning_rate": 3.284713077112881e-06, + "loss": 0.81251794, + "num_input_tokens_seen": 53438135, + "step": 2492, + "time_per_iteration": 2.5228989124298096 + }, + { + "auxiliary_loss_clip": 0.01169831, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.06279361, + "balance_loss_mlp": 1.02304316, + "epoch": 0.29976552636325376, + "flos": 16938021870720.0, + "grad_norm": 3.0825661009400624, + "language_loss": 0.86485302, + "learning_rate": 3.284115971179056e-06, + "loss": 0.88687849, + "num_input_tokens_seen": 53452165, + "step": 2493, + "time_per_iteration": 2.545217514038086 + }, + { + "auxiliary_loss_clip": 0.01136519, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.05784714, + "balance_loss_mlp": 1.02459359, + "epoch": 0.2998857692538929, + "flos": 17056455989760.0, + "grad_norm": 1.8039814319555105, + "language_loss": 0.78785151, + "learning_rate": 3.283518670445859e-06, + "loss": 0.80955195, + "num_input_tokens_seen": 53470075, + "step": 2494, + "time_per_iteration": 2.604356050491333 + }, + { + "auxiliary_loss_clip": 0.01074994, + "auxiliary_loss_mlp": 0.00753826, + "balance_loss_clip": 1.02911782, + "balance_loss_mlp": 1.00020719, + "epoch": 0.30000601214453193, + "flos": 68831528025600.0, + "grad_norm": 0.6880120674253565, + "language_loss": 0.54308426, + "learning_rate": 3.2829211750038995e-06, + "loss": 0.56137252, + "num_input_tokens_seen": 53538705, + "step": 2495, + "time_per_iteration": 3.1627235412597656 + }, + { + "auxiliary_loss_clip": 0.01156512, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.05592608, + "balance_loss_mlp": 1.02512026, + "epoch": 0.30012625503517104, + "flos": 17603267708160.0, + "grad_norm": 1.9796274089981198, + "language_loss": 0.89079237, + "learning_rate": 3.2823234849438183e-06, + "loss": 0.91269684, + "num_input_tokens_seen": 53556740, + "step": 2496, + "time_per_iteration": 2.5424180030822754 + }, + { + "auxiliary_loss_clip": 0.01177575, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.06031883, + "balance_loss_mlp": 1.02330506, + "epoch": 0.30024649792581015, + "flos": 21252581775360.0, + "grad_norm": 1.9246867939105723, + "language_loss": 0.75855315, + "learning_rate": 3.2817256003562836e-06, + "loss": 0.78065014, + "num_input_tokens_seen": 53577115, + "step": 2497, + "time_per_iteration": 3.4003679752349854 + }, + { + "auxiliary_loss_clip": 0.01134962, + "auxiliary_loss_mlp": 0.01040467, + "balance_loss_clip": 1.05717862, + "balance_loss_mlp": 1.03079939, + "epoch": 0.3003667408164492, + "flos": 23003262748800.0, + "grad_norm": 1.7296803073515479, + "language_loss": 0.66278738, + "learning_rate": 3.281127521331995e-06, + "loss": 0.6845417, + "num_input_tokens_seen": 53598295, + "step": 2498, + "time_per_iteration": 2.653770923614502 + }, + { + "auxiliary_loss_clip": 0.01103264, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.02895892, + "balance_loss_mlp": 1.00180769, + "epoch": 0.3004869837070883, + "flos": 64232340750720.0, + "grad_norm": 0.8835941656506333, + "language_loss": 0.60625434, + "learning_rate": 3.2805292479616798e-06, + "loss": 0.62731689, + "num_input_tokens_seen": 53657160, + "step": 2499, + "time_per_iteration": 2.97001314163208 + }, + { + "auxiliary_loss_clip": 0.01177163, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.05830836, + "balance_loss_mlp": 1.0209167, + "epoch": 0.30060722659772743, + "flos": 26248653400320.0, + "grad_norm": 2.2195220560177997, + "language_loss": 0.91660655, + "learning_rate": 3.2799307803360955e-06, + "loss": 0.9386791, + "num_input_tokens_seen": 53673090, + "step": 2500, + "time_per_iteration": 2.6368207931518555 + }, + { + "auxiliary_loss_clip": 0.01199409, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.06164384, + "balance_loss_mlp": 1.02465725, + "epoch": 0.3007274694883665, + "flos": 24970879912320.0, + "grad_norm": 1.3896059002488632, + "language_loss": 0.8169753, + "learning_rate": 3.27933211854603e-06, + "loss": 0.83930206, + "num_input_tokens_seen": 53692145, + "step": 2501, + "time_per_iteration": 3.4357757568359375 + }, + { + "auxiliary_loss_clip": 0.01176098, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.0614953, + "balance_loss_mlp": 1.02126038, + "epoch": 0.3008477123790056, + "flos": 17055845458560.0, + "grad_norm": 1.5334444339985713, + "language_loss": 0.8710283, + "learning_rate": 3.278733262682299e-06, + "loss": 0.89309108, + "num_input_tokens_seen": 53710000, + "step": 2502, + "time_per_iteration": 2.5217535495758057 + }, + { + "auxiliary_loss_clip": 0.01202946, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.06204009, + "balance_loss_mlp": 1.02104104, + "epoch": 0.3009679552696447, + "flos": 21506398254720.0, + "grad_norm": 2.1236831730283776, + "language_loss": 0.82534826, + "learning_rate": 3.2781342128357484e-06, + "loss": 0.84767532, + "num_input_tokens_seen": 53729355, + "step": 2503, + "time_per_iteration": 4.005939960479736 + }, + { + "auxiliary_loss_clip": 0.01159268, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.05540967, + "balance_loss_mlp": 1.02297807, + "epoch": 0.30108819816028376, + "flos": 21134004001920.0, + "grad_norm": 2.192504855647531, + "language_loss": 0.80452847, + "learning_rate": 3.2775349690972547e-06, + "loss": 0.826446, + "num_input_tokens_seen": 53743505, + "step": 2504, + "time_per_iteration": 2.5541675090789795 + }, + { + "auxiliary_loss_clip": 0.01086458, + "auxiliary_loss_mlp": 0.01005833, + "balance_loss_clip": 1.02872694, + "balance_loss_mlp": 1.0046525, + "epoch": 0.30120844105092287, + "flos": 71126434938240.0, + "grad_norm": 0.7594273797325565, + "language_loss": 0.5185554, + "learning_rate": 3.276935531557722e-06, + "loss": 0.5394783, + "num_input_tokens_seen": 53808725, + "step": 2505, + "time_per_iteration": 3.1689682006835938 + }, + { + "auxiliary_loss_clip": 0.0114669, + "auxiliary_loss_mlp": 0.01035879, + "balance_loss_clip": 1.05268741, + "balance_loss_mlp": 1.02696228, + "epoch": 0.301328683941562, + "flos": 20264571302400.0, + "grad_norm": 2.0878670427861725, + "language_loss": 0.79518831, + "learning_rate": 3.2763359003080837e-06, + "loss": 0.81701398, + "num_input_tokens_seen": 53825680, + "step": 2506, + "time_per_iteration": 2.5947065353393555 + }, + { + "auxiliary_loss_clip": 0.01077126, + "auxiliary_loss_mlp": 0.01004033, + "balance_loss_clip": 1.02478361, + "balance_loss_mlp": 1.00269222, + "epoch": 0.30144892683220104, + "flos": 70648212240000.0, + "grad_norm": 0.7999185397598263, + "language_loss": 0.624529, + "learning_rate": 3.2757360754393047e-06, + "loss": 0.64534056, + "num_input_tokens_seen": 53889750, + "step": 2507, + "time_per_iteration": 3.1939477920532227 + }, + { + "auxiliary_loss_clip": 0.01185691, + "auxiliary_loss_mlp": 0.01028733, + "balance_loss_clip": 1.05850244, + "balance_loss_mlp": 1.01961398, + "epoch": 0.30156916972284015, + "flos": 22820549241600.0, + "grad_norm": 2.5404471931931263, + "language_loss": 0.63833511, + "learning_rate": 3.2751360570423767e-06, + "loss": 0.66047931, + "num_input_tokens_seen": 53908135, + "step": 2508, + "time_per_iteration": 2.5242459774017334 + }, + { + "auxiliary_loss_clip": 0.01173782, + "auxiliary_loss_mlp": 0.01034597, + "balance_loss_clip": 1.05881381, + "balance_loss_mlp": 1.02562666, + "epoch": 0.3016894126134792, + "flos": 29899188529920.0, + "grad_norm": 2.0208810286106322, + "language_loss": 0.7583909, + "learning_rate": 3.2745358452083236e-06, + "loss": 0.78047472, + "num_input_tokens_seen": 53931035, + "step": 2509, + "time_per_iteration": 2.669614315032959 + }, + { + "auxiliary_loss_clip": 0.01187899, + "auxiliary_loss_mlp": 0.01028985, + "balance_loss_clip": 1.06250525, + "balance_loss_mlp": 1.02098036, + "epoch": 0.3018096555041183, + "flos": 21546331200000.0, + "grad_norm": 1.3699594928564742, + "language_loss": 0.82284606, + "learning_rate": 3.2739354400281955e-06, + "loss": 0.84501493, + "num_input_tokens_seen": 53952255, + "step": 2510, + "time_per_iteration": 2.6003730297088623 + }, + { + "auxiliary_loss_clip": 0.01065413, + "auxiliary_loss_mlp": 0.00753369, + "balance_loss_clip": 1.02279091, + "balance_loss_mlp": 1.00027013, + "epoch": 0.3019298983947574, + "flos": 59136294597120.0, + "grad_norm": 0.8677804210448475, + "language_loss": 0.63737381, + "learning_rate": 3.2733348415930744e-06, + "loss": 0.65556163, + "num_input_tokens_seen": 54014125, + "step": 2511, + "time_per_iteration": 3.18686842918396 + }, + { + "auxiliary_loss_clip": 0.01153976, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.05634844, + "balance_loss_mlp": 1.02103055, + "epoch": 0.3020501412853965, + "flos": 34423070941440.0, + "grad_norm": 1.753753272570682, + "language_loss": 0.80697787, + "learning_rate": 3.27273404999407e-06, + "loss": 0.82881618, + "num_input_tokens_seen": 54036345, + "step": 2512, + "time_per_iteration": 2.7594552040100098 + }, + { + "auxiliary_loss_clip": 0.01076693, + "auxiliary_loss_mlp": 0.01001928, + "balance_loss_clip": 1.02240872, + "balance_loss_mlp": 1.00051546, + "epoch": 0.3021703841760356, + "flos": 71008288128000.0, + "grad_norm": 0.7945268294047647, + "language_loss": 0.60523999, + "learning_rate": 3.272133065322322e-06, + "loss": 0.62602621, + "num_input_tokens_seen": 54094615, + "step": 2513, + "time_per_iteration": 3.1161978244781494 + }, + { + "auxiliary_loss_clip": 0.01198852, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.05981231, + "balance_loss_mlp": 1.02458715, + "epoch": 0.3022906270666747, + "flos": 21510528318720.0, + "grad_norm": 2.294592755166712, + "language_loss": 0.79291415, + "learning_rate": 3.271531887669e-06, + "loss": 0.81523466, + "num_input_tokens_seen": 54114675, + "step": 2514, + "time_per_iteration": 2.560870885848999 + }, + { + "auxiliary_loss_clip": 0.01146909, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.0526731, + "balance_loss_mlp": 1.0221231, + "epoch": 0.30241086995731375, + "flos": 31132001168640.0, + "grad_norm": 2.3291015231033088, + "language_loss": 0.63726056, + "learning_rate": 3.2709305171253015e-06, + "loss": 0.65904373, + "num_input_tokens_seen": 54134795, + "step": 2515, + "time_per_iteration": 2.691617250442505 + }, + { + "auxiliary_loss_clip": 0.0118586, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.06004691, + "balance_loss_mlp": 1.02132368, + "epoch": 0.30253111284795287, + "flos": 23511542152320.0, + "grad_norm": 1.767714350659697, + "language_loss": 0.7750448, + "learning_rate": 3.2703289537824536e-06, + "loss": 0.79720253, + "num_input_tokens_seen": 54154595, + "step": 2516, + "time_per_iteration": 2.5222325325012207 + }, + { + "auxiliary_loss_clip": 0.01145503, + "auxiliary_loss_mlp": 0.01037356, + "balance_loss_clip": 1.05513978, + "balance_loss_mlp": 1.0281949, + "epoch": 0.302651355738592, + "flos": 18725367651840.0, + "grad_norm": 2.5750889189759074, + "language_loss": 0.79021883, + "learning_rate": 3.269727197731714e-06, + "loss": 0.81204736, + "num_input_tokens_seen": 54167360, + "step": 2517, + "time_per_iteration": 2.5560317039489746 + }, + { + "auxiliary_loss_clip": 0.01138605, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.05475283, + "balance_loss_mlp": 1.02629876, + "epoch": 0.30277159862923103, + "flos": 22418888382720.0, + "grad_norm": 1.6179424218917458, + "language_loss": 0.78088784, + "learning_rate": 3.269125249064367e-06, + "loss": 0.8026244, + "num_input_tokens_seen": 54187055, + "step": 2518, + "time_per_iteration": 2.5996103286743164 + }, + { + "auxiliary_loss_clip": 0.01201619, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.06070828, + "balance_loss_mlp": 1.02100289, + "epoch": 0.30289184151987014, + "flos": 22273126992000.0, + "grad_norm": 1.6123060852828135, + "language_loss": 0.83222878, + "learning_rate": 3.2685231078717297e-06, + "loss": 0.85454059, + "num_input_tokens_seen": 54207245, + "step": 2519, + "time_per_iteration": 2.5161924362182617 + }, + { + "auxiliary_loss_clip": 0.01148396, + "auxiliary_loss_mlp": 0.00763591, + "balance_loss_clip": 1.05572081, + "balance_loss_mlp": 1.00041389, + "epoch": 0.30301208441050925, + "flos": 25225594231680.0, + "grad_norm": 2.634598646533618, + "language_loss": 0.75251359, + "learning_rate": 3.267920774245145e-06, + "loss": 0.77163351, + "num_input_tokens_seen": 54226650, + "step": 2520, + "time_per_iteration": 2.594172239303589 + }, + { + "auxiliary_loss_clip": 0.01191567, + "auxiliary_loss_mlp": 0.01038977, + "balance_loss_clip": 1.06405401, + "balance_loss_mlp": 1.02931499, + "epoch": 0.3031323273011483, + "flos": 23039245198080.0, + "grad_norm": 2.1235876154280096, + "language_loss": 0.84863114, + "learning_rate": 3.2673182482759876e-06, + "loss": 0.87093663, + "num_input_tokens_seen": 54245765, + "step": 2521, + "time_per_iteration": 2.506431818008423 + }, + { + "auxiliary_loss_clip": 0.01186747, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.06061029, + "balance_loss_mlp": 1.02059186, + "epoch": 0.3032525701917874, + "flos": 18876695650560.0, + "grad_norm": 2.65023783129727, + "language_loss": 0.66362751, + "learning_rate": 3.266715530055659e-06, + "loss": 0.68579012, + "num_input_tokens_seen": 54263915, + "step": 2522, + "time_per_iteration": 2.4898760318756104 + }, + { + "auxiliary_loss_clip": 0.01177743, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.0578686, + "balance_loss_mlp": 1.02010381, + "epoch": 0.30337281308242653, + "flos": 17782641250560.0, + "grad_norm": 1.5567698222303759, + "language_loss": 0.80538595, + "learning_rate": 3.2661126196755927e-06, + "loss": 0.82745516, + "num_input_tokens_seen": 54283025, + "step": 2523, + "time_per_iteration": 3.3456547260284424 + }, + { + "auxiliary_loss_clip": 0.01093292, + "auxiliary_loss_mlp": 0.0100313, + "balance_loss_clip": 1.02000046, + "balance_loss_mlp": 1.00193238, + "epoch": 0.3034930559730656, + "flos": 57824298426240.0, + "grad_norm": 0.7842039617537705, + "language_loss": 0.55972791, + "learning_rate": 3.265509517227248e-06, + "loss": 0.58069217, + "num_input_tokens_seen": 54339840, + "step": 2524, + "time_per_iteration": 3.0423965454101562 + }, + { + "auxiliary_loss_clip": 0.01173376, + "auxiliary_loss_mlp": 0.01029172, + "balance_loss_clip": 1.05697024, + "balance_loss_mlp": 1.02056503, + "epoch": 0.3036132988637047, + "flos": 14755587419520.0, + "grad_norm": 1.6294588722256331, + "language_loss": 0.81412554, + "learning_rate": 3.264906222802115e-06, + "loss": 0.836151, + "num_input_tokens_seen": 54357690, + "step": 2525, + "time_per_iteration": 2.520167589187622 + }, + { + "auxiliary_loss_clip": 0.01203987, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.06117499, + "balance_loss_mlp": 1.02296424, + "epoch": 0.30373354175434375, + "flos": 21033203460480.0, + "grad_norm": 1.963109081989034, + "language_loss": 0.77811176, + "learning_rate": 3.264302736491715e-06, + "loss": 0.80047679, + "num_input_tokens_seen": 54377810, + "step": 2526, + "time_per_iteration": 2.5642902851104736 + }, + { + "auxiliary_loss_clip": 0.0118515, + "auxiliary_loss_mlp": 0.01028345, + "balance_loss_clip": 1.06349432, + "balance_loss_mlp": 1.01973176, + "epoch": 0.30385378464498286, + "flos": 21143233797120.0, + "grad_norm": 1.7335452160130531, + "language_loss": 0.87648231, + "learning_rate": 3.263699058387594e-06, + "loss": 0.89861721, + "num_input_tokens_seen": 54395245, + "step": 2527, + "time_per_iteration": 2.5394296646118164 + }, + { + "auxiliary_loss_clip": 0.01152684, + "auxiliary_loss_mlp": 0.01035535, + "balance_loss_clip": 1.0527072, + "balance_loss_mlp": 1.02629042, + "epoch": 0.30397402753562197, + "flos": 20629244131200.0, + "grad_norm": 2.1590391549208627, + "language_loss": 0.90146124, + "learning_rate": 3.2630951885813315e-06, + "loss": 0.92334342, + "num_input_tokens_seen": 54412640, + "step": 2528, + "time_per_iteration": 3.4062328338623047 + }, + { + "auxiliary_loss_clip": 0.01170894, + "auxiliary_loss_mlp": 0.01034786, + "balance_loss_clip": 1.05523014, + "balance_loss_mlp": 1.02592254, + "epoch": 0.304094270426261, + "flos": 15085678429440.0, + "grad_norm": 2.254779655991013, + "language_loss": 0.78191692, + "learning_rate": 3.262491127164533e-06, + "loss": 0.80397367, + "num_input_tokens_seen": 54431455, + "step": 2529, + "time_per_iteration": 3.35711669921875 + }, + { + "auxiliary_loss_clip": 0.01178079, + "auxiliary_loss_mlp": 0.00763844, + "balance_loss_clip": 1.05872631, + "balance_loss_mlp": 1.00048196, + "epoch": 0.30421451331690014, + "flos": 13845216193920.0, + "grad_norm": 2.3109062809461705, + "language_loss": 0.80576789, + "learning_rate": 3.2618868742288337e-06, + "loss": 0.82518709, + "num_input_tokens_seen": 54448380, + "step": 2530, + "time_per_iteration": 2.5056116580963135 + }, + { + "auxiliary_loss_clip": 0.01184193, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.05911517, + "balance_loss_mlp": 1.02624011, + "epoch": 0.30433475620753925, + "flos": 17384212615680.0, + "grad_norm": 1.9498125816392136, + "language_loss": 0.721264, + "learning_rate": 3.261282429865899e-06, + "loss": 0.7434535, + "num_input_tokens_seen": 54466385, + "step": 2531, + "time_per_iteration": 2.5009498596191406 + }, + { + "auxiliary_loss_clip": 0.0117875, + "auxiliary_loss_mlp": 0.00762875, + "balance_loss_clip": 1.06110644, + "balance_loss_mlp": 1.00043011, + "epoch": 0.3044549990981783, + "flos": 18916951818240.0, + "grad_norm": 1.7131432541660319, + "language_loss": 0.72472709, + "learning_rate": 3.2606777941674225e-06, + "loss": 0.74414337, + "num_input_tokens_seen": 54485040, + "step": 2532, + "time_per_iteration": 2.523322820663452 + }, + { + "auxiliary_loss_clip": 0.01136456, + "auxiliary_loss_mlp": 0.01034818, + "balance_loss_clip": 1.05527604, + "balance_loss_mlp": 1.02533531, + "epoch": 0.3045752419888174, + "flos": 21068431724160.0, + "grad_norm": 2.048658010233344, + "language_loss": 0.84644771, + "learning_rate": 3.2600729672251276e-06, + "loss": 0.86816043, + "num_input_tokens_seen": 54502755, + "step": 2533, + "time_per_iteration": 2.6052637100219727 + }, + { + "auxiliary_loss_clip": 0.01201848, + "auxiliary_loss_mlp": 0.00764031, + "balance_loss_clip": 1.06364822, + "balance_loss_mlp": 1.00044715, + "epoch": 0.3046954848794565, + "flos": 29096405516160.0, + "grad_norm": 1.957431760527712, + "language_loss": 0.65837675, + "learning_rate": 3.259467949130765e-06, + "loss": 0.67803556, + "num_input_tokens_seen": 54524165, + "step": 2534, + "time_per_iteration": 2.528587579727173 + }, + { + "auxiliary_loss_clip": 0.01175538, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.06147075, + "balance_loss_mlp": 1.0237174, + "epoch": 0.3048157277700956, + "flos": 20295346279680.0, + "grad_norm": 2.717079864066357, + "language_loss": 0.82668656, + "learning_rate": 3.2588627399761164e-06, + "loss": 0.84876847, + "num_input_tokens_seen": 54540160, + "step": 2535, + "time_per_iteration": 2.52421498298645 + }, + { + "auxiliary_loss_clip": 0.01169873, + "auxiliary_loss_mlp": 0.01030812, + "balance_loss_clip": 1.05769825, + "balance_loss_mlp": 1.02286112, + "epoch": 0.3049359706607347, + "flos": 22739929165440.0, + "grad_norm": 1.8297408170458171, + "language_loss": 0.70868313, + "learning_rate": 3.2582573398529903e-06, + "loss": 0.73068994, + "num_input_tokens_seen": 54557515, + "step": 2536, + "time_per_iteration": 2.525496244430542 + }, + { + "auxiliary_loss_clip": 0.01157652, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.05547166, + "balance_loss_mlp": 1.02116156, + "epoch": 0.3050562135513738, + "flos": 18434634969600.0, + "grad_norm": 2.2877642762140815, + "language_loss": 0.73673189, + "learning_rate": 3.2576517488532265e-06, + "loss": 0.75861752, + "num_input_tokens_seen": 54573865, + "step": 2537, + "time_per_iteration": 2.5248055458068848 + }, + { + "auxiliary_loss_clip": 0.01183149, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.05673671, + "balance_loss_mlp": 1.02342308, + "epoch": 0.30517645644201286, + "flos": 20370327920640.0, + "grad_norm": 1.996552654536769, + "language_loss": 0.87468207, + "learning_rate": 3.257045967068692e-06, + "loss": 0.89682972, + "num_input_tokens_seen": 54593120, + "step": 2538, + "time_per_iteration": 2.4772703647613525 + }, + { + "auxiliary_loss_clip": 0.0120401, + "auxiliary_loss_mlp": 0.01036768, + "balance_loss_clip": 1.06302202, + "balance_loss_mlp": 1.02708817, + "epoch": 0.30529669933265197, + "flos": 21945118970880.0, + "grad_norm": 1.5150788562623656, + "language_loss": 0.82151115, + "learning_rate": 3.2564399945912848e-06, + "loss": 0.84391892, + "num_input_tokens_seen": 54612910, + "step": 2539, + "time_per_iteration": 2.465324878692627 + }, + { + "auxiliary_loss_clip": 0.01147856, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.0562706, + "balance_loss_mlp": 1.02314401, + "epoch": 0.305416942223291, + "flos": 21835411856640.0, + "grad_norm": 2.112055452973483, + "language_loss": 0.81893671, + "learning_rate": 3.2558338315129287e-06, + "loss": 0.84072852, + "num_input_tokens_seen": 54631055, + "step": 2540, + "time_per_iteration": 2.579378128051758 + }, + { + "auxiliary_loss_clip": 0.0117988, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.05811334, + "balance_loss_mlp": 1.02225316, + "epoch": 0.30553718511393013, + "flos": 33911810709120.0, + "grad_norm": 2.3360721068148647, + "language_loss": 0.75758135, + "learning_rate": 3.2552274779255785e-06, + "loss": 0.77969611, + "num_input_tokens_seen": 54651985, + "step": 2541, + "time_per_iteration": 2.568037509918213 + }, + { + "auxiliary_loss_clip": 0.01188058, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.06177032, + "balance_loss_mlp": 1.0269084, + "epoch": 0.30565742800456924, + "flos": 22268530051200.0, + "grad_norm": 2.816432528797108, + "language_loss": 0.7695508, + "learning_rate": 3.2546209339212184e-06, + "loss": 0.7917887, + "num_input_tokens_seen": 54671005, + "step": 2542, + "time_per_iteration": 2.503436326980591 + }, + { + "auxiliary_loss_clip": 0.01174199, + "auxiliary_loss_mlp": 0.0103293, + "balance_loss_clip": 1.05772519, + "balance_loss_mlp": 1.02360773, + "epoch": 0.3057776708952083, + "flos": 22565044823040.0, + "grad_norm": 4.817841391028305, + "language_loss": 0.77093232, + "learning_rate": 3.25401419959186e-06, + "loss": 0.79300356, + "num_input_tokens_seen": 54691615, + "step": 2543, + "time_per_iteration": 2.577075242996216 + }, + { + "auxiliary_loss_clip": 0.01184035, + "auxiliary_loss_mlp": 0.010404, + "balance_loss_clip": 1.06369662, + "balance_loss_mlp": 1.03090477, + "epoch": 0.3058979137858474, + "flos": 21799213925760.0, + "grad_norm": 1.985366899540621, + "language_loss": 0.76251411, + "learning_rate": 3.253407275029545e-06, + "loss": 0.78475845, + "num_input_tokens_seen": 54710520, + "step": 2544, + "time_per_iteration": 2.552225112915039 + }, + { + "auxiliary_loss_clip": 0.01164375, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.06153047, + "balance_loss_mlp": 1.02136254, + "epoch": 0.3060181566764865, + "flos": 26979435601920.0, + "grad_norm": 3.547934076312213, + "language_loss": 0.79899305, + "learning_rate": 3.2528001603263425e-06, + "loss": 0.82095122, + "num_input_tokens_seen": 54732590, + "step": 2545, + "time_per_iteration": 2.676417112350464 + }, + { + "auxiliary_loss_clip": 0.01189625, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.06468487, + "balance_loss_mlp": 1.02368116, + "epoch": 0.3061383995671256, + "flos": 19865101173120.0, + "grad_norm": 1.7584696023057218, + "language_loss": 0.81316751, + "learning_rate": 3.2521928555743514e-06, + "loss": 0.83539128, + "num_input_tokens_seen": 54749935, + "step": 2546, + "time_per_iteration": 2.502725124359131 + }, + { + "auxiliary_loss_clip": 0.01165588, + "auxiliary_loss_mlp": 0.00764089, + "balance_loss_clip": 1.05534077, + "balance_loss_mlp": 1.0003953, + "epoch": 0.3062586424577647, + "flos": 22127509255680.0, + "grad_norm": 1.7343426097250114, + "language_loss": 0.67941946, + "learning_rate": 3.2515853608657e-06, + "loss": 0.69871622, + "num_input_tokens_seen": 54767935, + "step": 2547, + "time_per_iteration": 2.6365807056427 + }, + { + "auxiliary_loss_clip": 0.01183558, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.06005025, + "balance_loss_mlp": 1.02557516, + "epoch": 0.3063788853484038, + "flos": 20845497962880.0, + "grad_norm": 1.9789148819918887, + "language_loss": 0.75077724, + "learning_rate": 3.250977676292545e-06, + "loss": 0.77295965, + "num_input_tokens_seen": 54786175, + "step": 2548, + "time_per_iteration": 2.62123966217041 + }, + { + "auxiliary_loss_clip": 0.01176399, + "auxiliary_loss_mlp": 0.01028232, + "balance_loss_clip": 1.05896759, + "balance_loss_mlp": 1.01921952, + "epoch": 0.30649912823904285, + "flos": 16209717707520.0, + "grad_norm": 2.1408589079634472, + "language_loss": 0.79484624, + "learning_rate": 3.2503698019470712e-06, + "loss": 0.8168925, + "num_input_tokens_seen": 54801945, + "step": 2549, + "time_per_iteration": 3.4255385398864746 + }, + { + "auxiliary_loss_clip": 0.01185958, + "auxiliary_loss_mlp": 0.01035496, + "balance_loss_clip": 1.05819559, + "balance_loss_mlp": 1.02554846, + "epoch": 0.30661937112968196, + "flos": 18617815353600.0, + "grad_norm": 2.9209145172391393, + "language_loss": 0.77924049, + "learning_rate": 3.249761737921492e-06, + "loss": 0.80145496, + "num_input_tokens_seen": 54818475, + "step": 2550, + "time_per_iteration": 2.5074427127838135 + }, + { + "auxiliary_loss_clip": 0.01172711, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.06199098, + "balance_loss_mlp": 1.0304476, + "epoch": 0.30673961402032107, + "flos": 31390809638400.0, + "grad_norm": 1.930094948696054, + "language_loss": 0.74417871, + "learning_rate": 3.249153484308051e-06, + "loss": 0.76630116, + "num_input_tokens_seen": 54837090, + "step": 2551, + "time_per_iteration": 2.6570584774017334 + }, + { + "auxiliary_loss_clip": 0.0113492, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.05287671, + "balance_loss_mlp": 1.02201533, + "epoch": 0.3068598569109601, + "flos": 20229809915520.0, + "grad_norm": 1.8974327612896555, + "language_loss": 0.78112525, + "learning_rate": 3.2485450411990194e-06, + "loss": 0.8027873, + "num_input_tokens_seen": 54856445, + "step": 2552, + "time_per_iteration": 2.609426736831665 + }, + { + "auxiliary_loss_clip": 0.01203668, + "auxiliary_loss_mlp": 0.01034169, + "balance_loss_clip": 1.06156111, + "balance_loss_mlp": 1.02482295, + "epoch": 0.30698009980159924, + "flos": 29601991399680.0, + "grad_norm": 1.67458772363467, + "language_loss": 0.82118469, + "learning_rate": 3.2479364086866983e-06, + "loss": 0.84356302, + "num_input_tokens_seen": 54876700, + "step": 2553, + "time_per_iteration": 2.568513870239258 + }, + { + "auxiliary_loss_clip": 0.01175701, + "auxiliary_loss_mlp": 0.00764556, + "balance_loss_clip": 1.06309676, + "balance_loss_mlp": 1.00036275, + "epoch": 0.30710034269223835, + "flos": 23842423261440.0, + "grad_norm": 2.0202817862949427, + "language_loss": 0.81202972, + "learning_rate": 3.247327586863416e-06, + "loss": 0.83143228, + "num_input_tokens_seen": 54897580, + "step": 2554, + "time_per_iteration": 2.635573387145996 + }, + { + "auxiliary_loss_clip": 0.01164287, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.05838156, + "balance_loss_mlp": 1.02221262, + "epoch": 0.3072205855828774, + "flos": 25884986152320.0, + "grad_norm": 2.5539249194740923, + "language_loss": 0.76973808, + "learning_rate": 3.2467185758215304e-06, + "loss": 0.79169923, + "num_input_tokens_seen": 54917320, + "step": 2555, + "time_per_iteration": 4.326486110687256 + }, + { + "auxiliary_loss_clip": 0.01164073, + "auxiliary_loss_mlp": 0.00764421, + "balance_loss_clip": 1.06065357, + "balance_loss_mlp": 1.0003407, + "epoch": 0.3073408284735165, + "flos": 22236390357120.0, + "grad_norm": 2.3399816496710737, + "language_loss": 0.85378468, + "learning_rate": 3.246109375653428e-06, + "loss": 0.87306958, + "num_input_tokens_seen": 54934085, + "step": 2556, + "time_per_iteration": 2.566512107849121 + }, + { + "auxiliary_loss_clip": 0.01202081, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.06287003, + "balance_loss_mlp": 1.02345324, + "epoch": 0.30746107136415557, + "flos": 19500284689920.0, + "grad_norm": 1.8613564569188183, + "language_loss": 0.78450274, + "learning_rate": 3.2454999864515243e-06, + "loss": 0.80685258, + "num_input_tokens_seen": 54953460, + "step": 2557, + "time_per_iteration": 2.485262632369995 + }, + { + "auxiliary_loss_clip": 0.01169424, + "auxiliary_loss_mlp": 0.00764566, + "balance_loss_clip": 1.05856824, + "balance_loss_mlp": 1.00033796, + "epoch": 0.3075813142547947, + "flos": 21724806902400.0, + "grad_norm": 1.9734679764863288, + "language_loss": 0.69481289, + "learning_rate": 3.244890408308263e-06, + "loss": 0.71415275, + "num_input_tokens_seen": 54974165, + "step": 2558, + "time_per_iteration": 2.5684025287628174 + }, + { + "auxiliary_loss_clip": 0.01143103, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.0528121, + "balance_loss_mlp": 1.01857102, + "epoch": 0.3077015571454338, + "flos": 24097963593600.0, + "grad_norm": 2.0964095580441033, + "language_loss": 0.61549819, + "learning_rate": 3.2442806413161165e-06, + "loss": 0.63720381, + "num_input_tokens_seen": 54993810, + "step": 2559, + "time_per_iteration": 2.6360771656036377 + }, + { + "auxiliary_loss_clip": 0.01145844, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.05490255, + "balance_loss_mlp": 1.02663326, + "epoch": 0.30782180003607285, + "flos": 18405476104320.0, + "grad_norm": 2.262498332346023, + "language_loss": 0.75669622, + "learning_rate": 3.243670685567586e-06, + "loss": 0.77851838, + "num_input_tokens_seen": 55011210, + "step": 2560, + "time_per_iteration": 2.574000835418701 + }, + { + "auxiliary_loss_clip": 0.01167786, + "auxiliary_loss_mlp": 0.00763195, + "balance_loss_clip": 1.05660057, + "balance_loss_mlp": 1.000337, + "epoch": 0.30794204292671196, + "flos": 23878549365120.0, + "grad_norm": 2.2030241756908877, + "language_loss": 0.80321157, + "learning_rate": 3.2430605411552012e-06, + "loss": 0.82252139, + "num_input_tokens_seen": 55031325, + "step": 2561, + "time_per_iteration": 2.574800968170166 + }, + { + "auxiliary_loss_clip": 0.0106763, + "auxiliary_loss_mlp": 0.01002978, + "balance_loss_clip": 1.02392912, + "balance_loss_mlp": 1.00163066, + "epoch": 0.30806228581735107, + "flos": 67927800816000.0, + "grad_norm": 0.895991085028544, + "language_loss": 0.70571971, + "learning_rate": 3.2424502081715205e-06, + "loss": 0.72642577, + "num_input_tokens_seen": 55094440, + "step": 2562, + "time_per_iteration": 3.1551928520202637 + }, + { + "auxiliary_loss_clip": 0.01173007, + "auxiliary_loss_mlp": 0.01035635, + "balance_loss_clip": 1.05844688, + "balance_loss_mlp": 1.02624738, + "epoch": 0.3081825287079901, + "flos": 23843213360640.0, + "grad_norm": 1.7024109787593373, + "language_loss": 0.78258002, + "learning_rate": 3.241839686709132e-06, + "loss": 0.80466652, + "num_input_tokens_seen": 55115375, + "step": 2563, + "time_per_iteration": 2.5709052085876465 + }, + { + "auxiliary_loss_clip": 0.01182179, + "auxiliary_loss_mlp": 0.01033847, + "balance_loss_clip": 1.05560935, + "balance_loss_mlp": 1.02456093, + "epoch": 0.30830277159862923, + "flos": 16209969102720.0, + "grad_norm": 2.2087492961752875, + "language_loss": 0.81828892, + "learning_rate": 3.2412289768606495e-06, + "loss": 0.84044921, + "num_input_tokens_seen": 55131945, + "step": 2564, + "time_per_iteration": 2.4789037704467773 + }, + { + "auxiliary_loss_clip": 0.01188191, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.06065464, + "balance_loss_mlp": 1.02835298, + "epoch": 0.30842301448926834, + "flos": 29349503723520.0, + "grad_norm": 1.7893080932667713, + "language_loss": 0.82809997, + "learning_rate": 3.240618078718718e-06, + "loss": 0.85035169, + "num_input_tokens_seen": 55153405, + "step": 2565, + "time_per_iteration": 2.6351876258850098 + }, + { + "auxiliary_loss_clip": 0.01153988, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.05391157, + "balance_loss_mlp": 1.02522802, + "epoch": 0.3085432573799074, + "flos": 21945190798080.0, + "grad_norm": 2.7851221701190694, + "language_loss": 0.74114138, + "learning_rate": 3.240006992376011e-06, + "loss": 0.7630266, + "num_input_tokens_seen": 55173030, + "step": 2566, + "time_per_iteration": 2.579993724822998 + }, + { + "auxiliary_loss_clip": 0.01175019, + "auxiliary_loss_mlp": 0.0104024, + "balance_loss_clip": 1.05949759, + "balance_loss_mlp": 1.03106689, + "epoch": 0.3086635002705465, + "flos": 22054718344320.0, + "grad_norm": 2.198744559276731, + "language_loss": 0.75980198, + "learning_rate": 3.2393957179252284e-06, + "loss": 0.78195465, + "num_input_tokens_seen": 55189565, + "step": 2567, + "time_per_iteration": 2.548823833465576 + }, + { + "auxiliary_loss_clip": 0.0119968, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.06107259, + "balance_loss_mlp": 1.02361417, + "epoch": 0.3087837431611856, + "flos": 32665925520000.0, + "grad_norm": 2.21318109491651, + "language_loss": 0.8092196, + "learning_rate": 3.2387842554591016e-06, + "loss": 0.83153862, + "num_input_tokens_seen": 55210380, + "step": 2568, + "time_per_iteration": 2.575373411178589 + }, + { + "auxiliary_loss_clip": 0.0119888, + "auxiliary_loss_mlp": 0.01038499, + "balance_loss_clip": 1.0610764, + "balance_loss_mlp": 1.02949309, + "epoch": 0.3089039860518247, + "flos": 17599245384960.0, + "grad_norm": 2.0483665100440356, + "language_loss": 0.87769192, + "learning_rate": 3.238172605070388e-06, + "loss": 0.90006578, + "num_input_tokens_seen": 55225795, + "step": 2569, + "time_per_iteration": 2.4413695335388184 + }, + { + "auxiliary_loss_clip": 0.01182767, + "auxiliary_loss_mlp": 0.00764516, + "balance_loss_clip": 1.05796194, + "balance_loss_mlp": 1.00036359, + "epoch": 0.3090242289424638, + "flos": 14383839611520.0, + "grad_norm": 2.2588303445951907, + "language_loss": 0.78495175, + "learning_rate": 3.2375607668518745e-06, + "loss": 0.80442452, + "num_input_tokens_seen": 55238830, + "step": 2570, + "time_per_iteration": 2.4739160537719727 + }, + { + "auxiliary_loss_clip": 0.01161531, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.05574048, + "balance_loss_mlp": 1.02392864, + "epoch": 0.30914447183310284, + "flos": 16068625084800.0, + "grad_norm": 2.4154320674368708, + "language_loss": 0.90108687, + "learning_rate": 3.236948740896377e-06, + "loss": 0.923033, + "num_input_tokens_seen": 55253630, + "step": 2571, + "time_per_iteration": 2.5089807510375977 + }, + { + "auxiliary_loss_clip": 0.01184009, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.05975842, + "balance_loss_mlp": 1.02452934, + "epoch": 0.30926471472374195, + "flos": 32230221546240.0, + "grad_norm": 1.3806592437244731, + "language_loss": 0.84018606, + "learning_rate": 3.2363365272967384e-06, + "loss": 0.86235881, + "num_input_tokens_seen": 55276200, + "step": 2572, + "time_per_iteration": 2.575801134109497 + }, + { + "auxiliary_loss_clip": 0.01182971, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.06215227, + "balance_loss_mlp": 1.02581048, + "epoch": 0.30938495761438106, + "flos": 20370722970240.0, + "grad_norm": 1.8761373595849862, + "language_loss": 0.81487721, + "learning_rate": 3.235724126145832e-06, + "loss": 0.8370685, + "num_input_tokens_seen": 55292235, + "step": 2573, + "time_per_iteration": 2.4904286861419678 + }, + { + "auxiliary_loss_clip": 0.01174138, + "auxiliary_loss_mlp": 0.01036856, + "balance_loss_clip": 1.05594969, + "balance_loss_mlp": 1.02745676, + "epoch": 0.3095052005050201, + "flos": 24061155131520.0, + "grad_norm": 1.493099705913763, + "language_loss": 0.77647698, + "learning_rate": 3.235111537536558e-06, + "loss": 0.79858696, + "num_input_tokens_seen": 55313050, + "step": 2574, + "time_per_iteration": 2.524824619293213 + }, + { + "auxiliary_loss_clip": 0.01186069, + "auxiliary_loss_mlp": 0.01026742, + "balance_loss_clip": 1.06078708, + "balance_loss_mlp": 1.01811147, + "epoch": 0.30962544339565923, + "flos": 23401547729280.0, + "grad_norm": 1.8130970592096554, + "language_loss": 0.8289758, + "learning_rate": 3.2344987615618456e-06, + "loss": 0.8511039, + "num_input_tokens_seen": 55332885, + "step": 2575, + "time_per_iteration": 2.562351942062378 + }, + { + "auxiliary_loss_clip": 0.01153689, + "auxiliary_loss_mlp": 0.0103978, + "balance_loss_clip": 1.05712855, + "balance_loss_mlp": 1.03079736, + "epoch": 0.30974568628629834, + "flos": 33799984692480.0, + "grad_norm": 1.5430371627268409, + "language_loss": 0.78230202, + "learning_rate": 3.2338857983146533e-06, + "loss": 0.80423671, + "num_input_tokens_seen": 55354385, + "step": 2576, + "time_per_iteration": 2.6436009407043457 + }, + { + "auxiliary_loss_clip": 0.01159953, + "auxiliary_loss_mlp": 0.01027954, + "balance_loss_clip": 1.05686581, + "balance_loss_mlp": 1.01823258, + "epoch": 0.3098659291769374, + "flos": 20229594433920.0, + "grad_norm": 1.7415915432371871, + "language_loss": 0.76381147, + "learning_rate": 3.233272647887966e-06, + "loss": 0.78569055, + "num_input_tokens_seen": 55373275, + "step": 2577, + "time_per_iteration": 3.376648187637329 + }, + { + "auxiliary_loss_clip": 0.01201486, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.06256604, + "balance_loss_mlp": 1.02931213, + "epoch": 0.3099861720675765, + "flos": 24748556682240.0, + "grad_norm": 1.5511575236520265, + "language_loss": 0.8996799, + "learning_rate": 3.2326593103747985e-06, + "loss": 0.92207867, + "num_input_tokens_seen": 55392290, + "step": 2578, + "time_per_iteration": 2.5045881271362305 + }, + { + "auxiliary_loss_clip": 0.01182392, + "auxiliary_loss_mlp": 0.01033662, + "balance_loss_clip": 1.06043911, + "balance_loss_mlp": 1.02469134, + "epoch": 0.3101064149582156, + "flos": 11765485704960.0, + "grad_norm": 3.4427079003615595, + "language_loss": 0.85038143, + "learning_rate": 3.2320457858681936e-06, + "loss": 0.87254196, + "num_input_tokens_seen": 55410680, + "step": 2579, + "time_per_iteration": 2.4898135662078857 + }, + { + "auxiliary_loss_clip": 0.01169894, + "auxiliary_loss_mlp": 0.01027, + "balance_loss_clip": 1.05594587, + "balance_loss_mlp": 1.0181253, + "epoch": 0.31022665784885467, + "flos": 23033247626880.0, + "grad_norm": 2.0790316675104306, + "language_loss": 0.85322726, + "learning_rate": 3.2314320744612228e-06, + "loss": 0.87519616, + "num_input_tokens_seen": 55425980, + "step": 2580, + "time_per_iteration": 2.5371475219726562 + }, + { + "auxiliary_loss_clip": 0.0118271, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.06065238, + "balance_loss_mlp": 1.02134824, + "epoch": 0.3103469007394938, + "flos": 16289188548480.0, + "grad_norm": 1.6085496797856134, + "language_loss": 0.76590157, + "learning_rate": 3.2308181762469854e-06, + "loss": 0.78802741, + "num_input_tokens_seen": 55443925, + "step": 2581, + "time_per_iteration": 2.503220558166504 + }, + { + "auxiliary_loss_clip": 0.01200649, + "auxiliary_loss_mlp": 0.01035645, + "balance_loss_clip": 1.0603869, + "balance_loss_mlp": 1.02676368, + "epoch": 0.3104671436301329, + "flos": 30515271626880.0, + "grad_norm": 2.1078312850665246, + "language_loss": 0.78558457, + "learning_rate": 3.230204091318609e-06, + "loss": 0.80794752, + "num_input_tokens_seen": 55464465, + "step": 2582, + "time_per_iteration": 4.275298833847046 + }, + { + "auxiliary_loss_clip": 0.01196594, + "auxiliary_loss_mlp": 0.00762884, + "balance_loss_clip": 1.0596447, + "balance_loss_mlp": 1.00037193, + "epoch": 0.31058738652077195, + "flos": 20047240062720.0, + "grad_norm": 1.7650193911695984, + "language_loss": 0.84690374, + "learning_rate": 3.2295898197692503e-06, + "loss": 0.86649853, + "num_input_tokens_seen": 55483425, + "step": 2583, + "time_per_iteration": 2.4883065223693848 + }, + { + "auxiliary_loss_clip": 0.01197684, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.05907989, + "balance_loss_mlp": 1.02538133, + "epoch": 0.31070762941141106, + "flos": 28074639237120.0, + "grad_norm": 1.6860276225057864, + "language_loss": 0.78884518, + "learning_rate": 3.228975361692094e-06, + "loss": 0.81115985, + "num_input_tokens_seen": 55504445, + "step": 2584, + "time_per_iteration": 2.5420374870300293 + }, + { + "auxiliary_loss_clip": 0.0119043, + "auxiliary_loss_mlp": 0.00764102, + "balance_loss_clip": 1.05986571, + "balance_loss_mlp": 1.00039804, + "epoch": 0.31082787230205017, + "flos": 20521907314560.0, + "grad_norm": 2.476101687890319, + "language_loss": 0.80007064, + "learning_rate": 3.228360717180352e-06, + "loss": 0.81961596, + "num_input_tokens_seen": 55521970, + "step": 2585, + "time_per_iteration": 2.5032527446746826 + }, + { + "auxiliary_loss_clip": 0.01096014, + "auxiliary_loss_mlp": 0.00753815, + "balance_loss_clip": 1.02352905, + "balance_loss_mlp": 1.00082803, + "epoch": 0.3109481151926892, + "flos": 62445928723200.0, + "grad_norm": 0.8243884665032329, + "language_loss": 0.59454364, + "learning_rate": 3.227745886327266e-06, + "loss": 0.61304194, + "num_input_tokens_seen": 55580665, + "step": 2586, + "time_per_iteration": 3.005657434463501 + }, + { + "auxiliary_loss_clip": 0.01095846, + "auxiliary_loss_mlp": 0.01005528, + "balance_loss_clip": 1.02343237, + "balance_loss_mlp": 1.00436008, + "epoch": 0.31106835808332833, + "flos": 44746744723200.0, + "grad_norm": 0.80831643848649, + "language_loss": 0.55916452, + "learning_rate": 3.227130869226105e-06, + "loss": 0.58017826, + "num_input_tokens_seen": 55637825, + "step": 2587, + "time_per_iteration": 3.029630661010742 + }, + { + "auxiliary_loss_clip": 0.01181659, + "auxiliary_loss_mlp": 0.01023639, + "balance_loss_clip": 1.05678034, + "balance_loss_mlp": 1.01563406, + "epoch": 0.3111886009739674, + "flos": 23403056100480.0, + "grad_norm": 2.407839259251215, + "language_loss": 0.83063811, + "learning_rate": 3.226515665970167e-06, + "loss": 0.85269111, + "num_input_tokens_seen": 55655365, + "step": 2588, + "time_per_iteration": 2.5212247371673584 + }, + { + "auxiliary_loss_clip": 0.01182968, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.0575577, + "balance_loss_mlp": 1.0193181, + "epoch": 0.3113088438646065, + "flos": 17530728192000.0, + "grad_norm": 2.3574393718216236, + "language_loss": 0.85905403, + "learning_rate": 3.225900276652777e-06, + "loss": 0.8811692, + "num_input_tokens_seen": 55672140, + "step": 2589, + "time_per_iteration": 2.507843017578125 + }, + { + "auxiliary_loss_clip": 0.01174194, + "auxiliary_loss_mlp": 0.01035051, + "balance_loss_clip": 1.05657244, + "balance_loss_mlp": 1.02644444, + "epoch": 0.3114290867552456, + "flos": 28365802882560.0, + "grad_norm": 1.711456013896892, + "language_loss": 0.75411683, + "learning_rate": 3.2252847013672906e-06, + "loss": 0.77620929, + "num_input_tokens_seen": 55694800, + "step": 2590, + "time_per_iteration": 2.6080386638641357 + }, + { + "auxiliary_loss_clip": 0.01145343, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.05118597, + "balance_loss_mlp": 1.01910901, + "epoch": 0.31154932964588467, + "flos": 27379157126400.0, + "grad_norm": 1.9189915256378762, + "language_loss": 0.76221371, + "learning_rate": 3.224668940207089e-06, + "loss": 0.7839461, + "num_input_tokens_seen": 55713785, + "step": 2591, + "time_per_iteration": 2.7283730506896973 + }, + { + "auxiliary_loss_clip": 0.01129153, + "auxiliary_loss_mlp": 0.01038358, + "balance_loss_clip": 1.04771507, + "balance_loss_mlp": 1.02913737, + "epoch": 0.3116695725365238, + "flos": 26541864120960.0, + "grad_norm": 1.7434602583579224, + "language_loss": 0.86475694, + "learning_rate": 3.2240529932655828e-06, + "loss": 0.88643205, + "num_input_tokens_seen": 55733050, + "step": 2592, + "time_per_iteration": 2.645447015762329 + }, + { + "auxiliary_loss_clip": 0.01165464, + "auxiliary_loss_mlp": 0.01037197, + "balance_loss_clip": 1.05661762, + "balance_loss_mlp": 1.02818441, + "epoch": 0.3117898154271629, + "flos": 21177600134400.0, + "grad_norm": 2.420129463294857, + "language_loss": 0.88537049, + "learning_rate": 3.223436860636211e-06, + "loss": 0.90739703, + "num_input_tokens_seen": 55748685, + "step": 2593, + "time_per_iteration": 2.565955400466919 + }, + { + "auxiliary_loss_clip": 0.01198644, + "auxiliary_loss_mlp": 0.01037605, + "balance_loss_clip": 1.06061149, + "balance_loss_mlp": 1.02911162, + "epoch": 0.31191005831780194, + "flos": 27272430840960.0, + "grad_norm": 1.6335165093259305, + "language_loss": 0.7407468, + "learning_rate": 3.2228205424124403e-06, + "loss": 0.76310927, + "num_input_tokens_seen": 55771840, + "step": 2594, + "time_per_iteration": 2.57686710357666 + }, + { + "auxiliary_loss_clip": 0.01155782, + "auxiliary_loss_mlp": 0.01025274, + "balance_loss_clip": 1.05371058, + "balance_loss_mlp": 1.01619005, + "epoch": 0.31203030120844105, + "flos": 12963501043200.0, + "grad_norm": 2.4080262192992126, + "language_loss": 0.74761307, + "learning_rate": 3.222204038687765e-06, + "loss": 0.7694236, + "num_input_tokens_seen": 55784975, + "step": 2595, + "time_per_iteration": 2.5144760608673096 + }, + { + "auxiliary_loss_clip": 0.01178774, + "auxiliary_loss_mlp": 0.01028422, + "balance_loss_clip": 1.05625701, + "balance_loss_mlp": 1.02026248, + "epoch": 0.31215054409908016, + "flos": 27562014288000.0, + "grad_norm": 1.7144754742947774, + "language_loss": 0.87691414, + "learning_rate": 3.221587349555709e-06, + "loss": 0.8989861, + "num_input_tokens_seen": 55805235, + "step": 2596, + "time_per_iteration": 2.611163854598999 + }, + { + "auxiliary_loss_clip": 0.01173395, + "auxiliary_loss_mlp": 0.01026693, + "balance_loss_clip": 1.05747008, + "balance_loss_mlp": 1.01781225, + "epoch": 0.3122707869897192, + "flos": 21506326427520.0, + "grad_norm": 1.6947963425357557, + "language_loss": 0.69178277, + "learning_rate": 3.2209704751098236e-06, + "loss": 0.71378362, + "num_input_tokens_seen": 55824265, + "step": 2597, + "time_per_iteration": 2.5642731189727783 + }, + { + "auxiliary_loss_clip": 0.01172083, + "auxiliary_loss_mlp": 0.01029978, + "balance_loss_clip": 1.05868542, + "balance_loss_mlp": 1.02103162, + "epoch": 0.31239102988035833, + "flos": 15187017674880.0, + "grad_norm": 2.015586882940876, + "language_loss": 0.82808524, + "learning_rate": 3.2203534154436875e-06, + "loss": 0.85010588, + "num_input_tokens_seen": 55838620, + "step": 2598, + "time_per_iteration": 2.5268592834472656 + }, + { + "auxiliary_loss_clip": 0.0112323, + "auxiliary_loss_mlp": 0.01040877, + "balance_loss_clip": 1.0518713, + "balance_loss_mlp": 1.03202569, + "epoch": 0.31251127277099744, + "flos": 22053712763520.0, + "grad_norm": 1.9494711890694392, + "language_loss": 0.75375074, + "learning_rate": 3.2197361706509084e-06, + "loss": 0.77539182, + "num_input_tokens_seen": 55859375, + "step": 2599, + "time_per_iteration": 2.629678964614868 + }, + { + "auxiliary_loss_clip": 0.01201196, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.05931282, + "balance_loss_mlp": 1.02324128, + "epoch": 0.3126315156616365, + "flos": 15193984913280.0, + "grad_norm": 2.536284799310931, + "language_loss": 0.8332063, + "learning_rate": 3.2191187408251228e-06, + "loss": 0.8555491, + "num_input_tokens_seen": 55876535, + "step": 2600, + "time_per_iteration": 2.469064950942993 + }, + { + "auxiliary_loss_clip": 0.01190933, + "auxiliary_loss_mlp": 0.0103469, + "balance_loss_clip": 1.05906367, + "balance_loss_mlp": 1.02470016, + "epoch": 0.3127517585522756, + "flos": 18145338831360.0, + "grad_norm": 2.0636303833563083, + "language_loss": 0.7878629, + "learning_rate": 3.218501126059993e-06, + "loss": 0.81011915, + "num_input_tokens_seen": 55891930, + "step": 2601, + "time_per_iteration": 2.456859588623047 + }, + { + "auxiliary_loss_clip": 0.01186845, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.05746675, + "balance_loss_mlp": 1.01906276, + "epoch": 0.31287200144291466, + "flos": 21908633731200.0, + "grad_norm": 1.9752061429597036, + "language_loss": 0.81559646, + "learning_rate": 3.2178833264492116e-06, + "loss": 0.83774841, + "num_input_tokens_seen": 55910635, + "step": 2602, + "time_per_iteration": 2.511857271194458 + }, + { + "auxiliary_loss_clip": 0.01193463, + "auxiliary_loss_mlp": 0.01027102, + "balance_loss_clip": 1.06080794, + "balance_loss_mlp": 1.01791739, + "epoch": 0.31299224433355377, + "flos": 29896997800320.0, + "grad_norm": 2.0025992454860844, + "language_loss": 0.76295877, + "learning_rate": 3.217265342086498e-06, + "loss": 0.78516448, + "num_input_tokens_seen": 55931125, + "step": 2603, + "time_per_iteration": 3.4304370880126953 + }, + { + "auxiliary_loss_clip": 0.01161095, + "auxiliary_loss_mlp": 0.00765278, + "balance_loss_clip": 1.05863607, + "balance_loss_mlp": 1.00053036, + "epoch": 0.3131124872241929, + "flos": 11655886331520.0, + "grad_norm": 4.3125303152270975, + "language_loss": 0.72939312, + "learning_rate": 3.216647173065599e-06, + "loss": 0.74865687, + "num_input_tokens_seen": 55946590, + "step": 2604, + "time_per_iteration": 2.5584330558776855 + }, + { + "auxiliary_loss_clip": 0.01169084, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.05905271, + "balance_loss_mlp": 1.02479219, + "epoch": 0.31323273011483194, + "flos": 49848785470080.0, + "grad_norm": 1.8110611455828947, + "language_loss": 0.73628491, + "learning_rate": 3.216028819480292e-06, + "loss": 0.75831854, + "num_input_tokens_seen": 55967930, + "step": 2605, + "time_per_iteration": 2.773589849472046 + }, + { + "auxiliary_loss_clip": 0.01157006, + "auxiliary_loss_mlp": 0.01033852, + "balance_loss_clip": 1.0549798, + "balance_loss_mlp": 1.02457714, + "epoch": 0.31335297300547105, + "flos": 22601278667520.0, + "grad_norm": 3.366422634366582, + "language_loss": 0.75332034, + "learning_rate": 3.2154102814243793e-06, + "loss": 0.77522898, + "num_input_tokens_seen": 55987070, + "step": 2606, + "time_per_iteration": 2.541259765625 + }, + { + "auxiliary_loss_clip": 0.01160178, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.05788648, + "balance_loss_mlp": 1.02585053, + "epoch": 0.31347321589611016, + "flos": 34710858708480.0, + "grad_norm": 2.3671053727266798, + "language_loss": 0.66825628, + "learning_rate": 3.2147915589916937e-06, + "loss": 0.69020343, + "num_input_tokens_seen": 56008630, + "step": 2607, + "time_per_iteration": 2.675459861755371 + }, + { + "auxiliary_loss_clip": 0.0116368, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.05455661, + "balance_loss_mlp": 1.02198386, + "epoch": 0.3135934587867492, + "flos": 19755789108480.0, + "grad_norm": 2.367441191309818, + "language_loss": 0.8307637, + "learning_rate": 3.2141726522760938e-06, + "loss": 0.85271144, + "num_input_tokens_seen": 56026690, + "step": 2608, + "time_per_iteration": 4.257196664810181 + }, + { + "auxiliary_loss_clip": 0.01080903, + "auxiliary_loss_mlp": 0.01003236, + "balance_loss_clip": 1.02172875, + "balance_loss_mlp": 1.00213897, + "epoch": 0.3137137016773883, + "flos": 65815535583360.0, + "grad_norm": 0.7093106504270107, + "language_loss": 0.52659452, + "learning_rate": 3.213553561371469e-06, + "loss": 0.54743588, + "num_input_tokens_seen": 56090425, + "step": 2609, + "time_per_iteration": 3.9358878135681152 + }, + { + "auxiliary_loss_clip": 0.01138251, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.05418134, + "balance_loss_mlp": 1.02505767, + "epoch": 0.31383394456802743, + "flos": 16252739222400.0, + "grad_norm": 2.1073021052085177, + "language_loss": 0.95681918, + "learning_rate": 3.212934286371733e-06, + "loss": 0.97853661, + "num_input_tokens_seen": 56107135, + "step": 2610, + "time_per_iteration": 2.6059062480926514 + }, + { + "auxiliary_loss_clip": 0.01187071, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.06295073, + "balance_loss_mlp": 1.01986957, + "epoch": 0.3139541874586665, + "flos": 38795517613440.0, + "grad_norm": 2.136827029398247, + "language_loss": 0.830742, + "learning_rate": 3.2123148273708304e-06, + "loss": 0.85290515, + "num_input_tokens_seen": 56127325, + "step": 2611, + "time_per_iteration": 2.7035458087921143 + }, + { + "auxiliary_loss_clip": 0.01197998, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.06103134, + "balance_loss_mlp": 1.01931071, + "epoch": 0.3140744303493056, + "flos": 25046328430080.0, + "grad_norm": 3.679265695046637, + "language_loss": 0.7698766, + "learning_rate": 3.211695184462733e-06, + "loss": 0.79214579, + "num_input_tokens_seen": 56148500, + "step": 2612, + "time_per_iteration": 2.5048346519470215 + }, + { + "auxiliary_loss_clip": 0.01063136, + "auxiliary_loss_mlp": 0.01003309, + "balance_loss_clip": 1.02280474, + "balance_loss_mlp": 1.002159, + "epoch": 0.3141946732399447, + "flos": 72504254782080.0, + "grad_norm": 0.8938898730384528, + "language_loss": 0.60477912, + "learning_rate": 3.2110753577414383e-06, + "loss": 0.62544358, + "num_input_tokens_seen": 56210080, + "step": 2613, + "time_per_iteration": 3.12212872505188 + }, + { + "auxiliary_loss_clip": 0.01171845, + "auxiliary_loss_mlp": 0.0102732, + "balance_loss_clip": 1.05562329, + "balance_loss_mlp": 1.01834309, + "epoch": 0.31431491613058377, + "flos": 19239788280960.0, + "grad_norm": 1.8428816284518195, + "language_loss": 0.79341722, + "learning_rate": 3.2104553473009757e-06, + "loss": 0.81540883, + "num_input_tokens_seen": 56228200, + "step": 2614, + "time_per_iteration": 2.6002743244171143 + }, + { + "auxiliary_loss_clip": 0.01134423, + "auxiliary_loss_mlp": 0.01029078, + "balance_loss_clip": 1.05161834, + "balance_loss_mlp": 1.02038729, + "epoch": 0.3144351590212229, + "flos": 36210596290560.0, + "grad_norm": 2.0145890970336846, + "language_loss": 0.68034756, + "learning_rate": 3.209835153235399e-06, + "loss": 0.70198256, + "num_input_tokens_seen": 56249755, + "step": 2615, + "time_per_iteration": 2.747421979904175 + }, + { + "auxiliary_loss_clip": 0.01144817, + "auxiliary_loss_mlp": 0.01029793, + "balance_loss_clip": 1.05205631, + "balance_loss_mlp": 1.02116847, + "epoch": 0.314555401911862, + "flos": 18551740285440.0, + "grad_norm": 1.982516614324758, + "language_loss": 0.67601323, + "learning_rate": 3.2092147756387916e-06, + "loss": 0.69775939, + "num_input_tokens_seen": 56270080, + "step": 2616, + "time_per_iteration": 2.594224691390991 + }, + { + "auxiliary_loss_clip": 0.0116238, + "auxiliary_loss_mlp": 0.01031769, + "balance_loss_clip": 1.05348861, + "balance_loss_mlp": 1.02202916, + "epoch": 0.31467564480250104, + "flos": 16362877299840.0, + "grad_norm": 2.235411078431745, + "language_loss": 0.83681297, + "learning_rate": 3.208594214605264e-06, + "loss": 0.85875446, + "num_input_tokens_seen": 56288625, + "step": 2617, + "time_per_iteration": 2.5438573360443115 + }, + { + "auxiliary_loss_clip": 0.01158166, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.05377257, + "balance_loss_mlp": 1.02332664, + "epoch": 0.31479588769314015, + "flos": 21652375127040.0, + "grad_norm": 1.9088795915386598, + "language_loss": 0.77119839, + "learning_rate": 3.2079734702289553e-06, + "loss": 0.79309744, + "num_input_tokens_seen": 56307520, + "step": 2618, + "time_per_iteration": 2.5629146099090576 + }, + { + "auxiliary_loss_clip": 0.01080781, + "auxiliary_loss_mlp": 0.00753882, + "balance_loss_clip": 1.02235162, + "balance_loss_mlp": 1.00093412, + "epoch": 0.3149161305837792, + "flos": 66051072040320.0, + "grad_norm": 0.8130474375700619, + "language_loss": 0.60432166, + "learning_rate": 3.207352542604031e-06, + "loss": 0.62266827, + "num_input_tokens_seen": 56369855, + "step": 2619, + "time_per_iteration": 3.16501522064209 + }, + { + "auxiliary_loss_clip": 0.0114237, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.05244112, + "balance_loss_mlp": 1.02701545, + "epoch": 0.3150363734744183, + "flos": 28987201192320.0, + "grad_norm": 1.472725751268888, + "language_loss": 0.78521335, + "learning_rate": 3.2067314318246864e-06, + "loss": 0.80699223, + "num_input_tokens_seen": 56390570, + "step": 2620, + "time_per_iteration": 2.6150474548339844 + }, + { + "auxiliary_loss_clip": 0.0116014, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.05875874, + "balance_loss_mlp": 1.01849079, + "epoch": 0.31515661636505743, + "flos": 27636600879360.0, + "grad_norm": 1.834349620895664, + "language_loss": 0.77787185, + "learning_rate": 3.206110137985143e-06, + "loss": 0.79974866, + "num_input_tokens_seen": 56410775, + "step": 2621, + "time_per_iteration": 2.601893663406372 + }, + { + "auxiliary_loss_clip": 0.01142134, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.05157673, + "balance_loss_mlp": 1.02236319, + "epoch": 0.3152768592556965, + "flos": 24605632465920.0, + "grad_norm": 2.046596875939888, + "language_loss": 0.92146122, + "learning_rate": 3.2054886611796505e-06, + "loss": 0.94320112, + "num_input_tokens_seen": 56429770, + "step": 2622, + "time_per_iteration": 2.5968804359436035 + }, + { + "auxiliary_loss_clip": 0.01092255, + "auxiliary_loss_mlp": 0.01005424, + "balance_loss_clip": 1.02100754, + "balance_loss_mlp": 1.00423741, + "epoch": 0.3153971021463356, + "flos": 68476908026880.0, + "grad_norm": 0.9058345926050926, + "language_loss": 0.63517946, + "learning_rate": 3.204867001502487e-06, + "loss": 0.65615624, + "num_input_tokens_seen": 56488425, + "step": 2623, + "time_per_iteration": 3.0292139053344727 + }, + { + "auxiliary_loss_clip": 0.01200761, + "auxiliary_loss_mlp": 0.01037772, + "balance_loss_clip": 1.06225538, + "balance_loss_mlp": 1.02772212, + "epoch": 0.3155173450369747, + "flos": 25593714766080.0, + "grad_norm": 2.7139432294518127, + "language_loss": 0.80709815, + "learning_rate": 3.2042451590479567e-06, + "loss": 0.82948345, + "num_input_tokens_seen": 56508940, + "step": 2624, + "time_per_iteration": 2.5205681324005127 + }, + { + "auxiliary_loss_clip": 0.01195533, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.06097662, + "balance_loss_mlp": 1.02177453, + "epoch": 0.31563758792761376, + "flos": 24309333175680.0, + "grad_norm": 1.6041019879332563, + "language_loss": 0.86744505, + "learning_rate": 3.203623133910394e-06, + "loss": 0.88970655, + "num_input_tokens_seen": 56527245, + "step": 2625, + "time_per_iteration": 2.5068979263305664 + }, + { + "auxiliary_loss_clip": 0.0112798, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.05176115, + "balance_loss_mlp": 1.02295136, + "epoch": 0.31575783081825287, + "flos": 31903865550720.0, + "grad_norm": 2.8678377148333003, + "language_loss": 0.77635121, + "learning_rate": 3.203000926184158e-06, + "loss": 0.79794919, + "num_input_tokens_seen": 56546170, + "step": 2626, + "time_per_iteration": 2.7037241458892822 + }, + { + "auxiliary_loss_clip": 0.01196746, + "auxiliary_loss_mlp": 0.0103294, + "balance_loss_clip": 1.06001151, + "balance_loss_mlp": 1.02461886, + "epoch": 0.315878073708892, + "flos": 30810960385920.0, + "grad_norm": 1.60115179440242, + "language_loss": 0.77441078, + "learning_rate": 3.202378535963639e-06, + "loss": 0.79670763, + "num_input_tokens_seen": 56567085, + "step": 2627, + "time_per_iteration": 2.5422019958496094 + }, + { + "auxiliary_loss_clip": 0.01160373, + "auxiliary_loss_mlp": 0.00764797, + "balance_loss_clip": 1.0549165, + "balance_loss_mlp": 1.00059295, + "epoch": 0.31599831659953104, + "flos": 22200264253440.0, + "grad_norm": 1.7362295986004062, + "language_loss": 0.83955228, + "learning_rate": 3.2017559633432516e-06, + "loss": 0.85880399, + "num_input_tokens_seen": 56586715, + "step": 2628, + "time_per_iteration": 2.552711009979248 + }, + { + "auxiliary_loss_clip": 0.01175928, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.05587053, + "balance_loss_mlp": 1.02692628, + "epoch": 0.31611855949017015, + "flos": 25593463370880.0, + "grad_norm": 3.1274857547054555, + "language_loss": 0.66390634, + "learning_rate": 3.2011332084174398e-06, + "loss": 0.68602282, + "num_input_tokens_seen": 56607585, + "step": 2629, + "time_per_iteration": 2.5938720703125 + }, + { + "auxiliary_loss_clip": 0.0118026, + "auxiliary_loss_mlp": 0.01030081, + "balance_loss_clip": 1.05879664, + "balance_loss_mlp": 1.02122974, + "epoch": 0.31623880238080926, + "flos": 20594087694720.0, + "grad_norm": 1.806039296557657, + "language_loss": 0.89102179, + "learning_rate": 3.2005102712806756e-06, + "loss": 0.91312522, + "num_input_tokens_seen": 56626415, + "step": 2630, + "time_per_iteration": 3.357517719268799 + }, + { + "auxiliary_loss_clip": 0.01185915, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.05891585, + "balance_loss_mlp": 1.02447951, + "epoch": 0.3163590452714483, + "flos": 12784917600000.0, + "grad_norm": 2.2428645288132847, + "language_loss": 0.72755176, + "learning_rate": 3.1998871520274575e-06, + "loss": 0.74974728, + "num_input_tokens_seen": 56641750, + "step": 2631, + "time_per_iteration": 2.4795680046081543 + }, + { + "auxiliary_loss_clip": 0.01169185, + "auxiliary_loss_mlp": 0.01035875, + "balance_loss_clip": 1.05409968, + "balance_loss_mlp": 1.02701151, + "epoch": 0.3164792881620874, + "flos": 23041292273280.0, + "grad_norm": 1.9105310617123425, + "language_loss": 0.84661406, + "learning_rate": 3.199263850752312e-06, + "loss": 0.86866462, + "num_input_tokens_seen": 56662585, + "step": 2632, + "time_per_iteration": 2.5387017726898193 + }, + { + "auxiliary_loss_clip": 0.01183183, + "auxiliary_loss_mlp": 0.01035867, + "balance_loss_clip": 1.05679798, + "balance_loss_mlp": 1.02616882, + "epoch": 0.31659953105272653, + "flos": 18296271780480.0, + "grad_norm": 1.9980979232432317, + "language_loss": 0.85386515, + "learning_rate": 3.198640367549795e-06, + "loss": 0.8760556, + "num_input_tokens_seen": 56681480, + "step": 2633, + "time_per_iteration": 2.5256948471069336 + }, + { + "auxiliary_loss_clip": 0.01182296, + "auxiliary_loss_mlp": 0.00763639, + "balance_loss_clip": 1.05698788, + "balance_loss_mlp": 1.00052941, + "epoch": 0.3167197739433656, + "flos": 25703421880320.0, + "grad_norm": 1.6505522849425667, + "language_loss": 0.85933113, + "learning_rate": 3.198016702514487e-06, + "loss": 0.8787905, + "num_input_tokens_seen": 56701760, + "step": 2634, + "time_per_iteration": 2.5501585006713867 + }, + { + "auxiliary_loss_clip": 0.01195496, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.05887139, + "balance_loss_mlp": 1.02205682, + "epoch": 0.3168400168340047, + "flos": 23546016230400.0, + "grad_norm": 1.583378822339557, + "language_loss": 0.84266776, + "learning_rate": 3.1973928557409972e-06, + "loss": 0.86493123, + "num_input_tokens_seen": 56719800, + "step": 2635, + "time_per_iteration": 5.027114629745483 + }, + { + "auxiliary_loss_clip": 0.01193628, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.05843687, + "balance_loss_mlp": 1.02272344, + "epoch": 0.31696025972464376, + "flos": 28366449327360.0, + "grad_norm": 1.9198474815738502, + "language_loss": 0.71574003, + "learning_rate": 3.1967688273239636e-06, + "loss": 0.73799217, + "num_input_tokens_seen": 56739605, + "step": 2636, + "time_per_iteration": 2.531128406524658 + }, + { + "auxiliary_loss_clip": 0.01154394, + "auxiliary_loss_mlp": 0.01032035, + "balance_loss_clip": 1.05565, + "balance_loss_mlp": 1.0230881, + "epoch": 0.31708050261528287, + "flos": 16399111144320.0, + "grad_norm": 1.6785021220979, + "language_loss": 0.82015347, + "learning_rate": 3.1961446173580503e-06, + "loss": 0.84201783, + "num_input_tokens_seen": 56756545, + "step": 2637, + "time_per_iteration": 2.5340161323547363 + }, + { + "auxiliary_loss_clip": 0.01165216, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.05676579, + "balance_loss_mlp": 1.02384412, + "epoch": 0.317200745505922, + "flos": 26212347728640.0, + "grad_norm": 1.648827139568316, + "language_loss": 0.77255428, + "learning_rate": 3.1955202259379502e-06, + "loss": 0.79453117, + "num_input_tokens_seen": 56778275, + "step": 2638, + "time_per_iteration": 2.572671890258789 + }, + { + "auxiliary_loss_clip": 0.01179761, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.05564928, + "balance_loss_mlp": 1.02509892, + "epoch": 0.31732098839656103, + "flos": 31350876693120.0, + "grad_norm": 2.1162610733862754, + "language_loss": 0.82747221, + "learning_rate": 3.194895653158381e-06, + "loss": 0.84960705, + "num_input_tokens_seen": 56797215, + "step": 2639, + "time_per_iteration": 2.5851449966430664 + }, + { + "auxiliary_loss_clip": 0.01089319, + "auxiliary_loss_mlp": 0.01001988, + "balance_loss_clip": 1.01750064, + "balance_loss_mlp": 1.00092149, + "epoch": 0.31744123128720014, + "flos": 58989024835200.0, + "grad_norm": 0.9047474169079357, + "language_loss": 0.55607307, + "learning_rate": 3.194270899114093e-06, + "loss": 0.57698613, + "num_input_tokens_seen": 56863010, + "step": 2640, + "time_per_iteration": 3.142221689224243 + }, + { + "auxiliary_loss_clip": 0.01187806, + "auxiliary_loss_mlp": 0.01034459, + "balance_loss_clip": 1.05908442, + "balance_loss_mlp": 1.02434397, + "epoch": 0.31756147417783925, + "flos": 17417573372160.0, + "grad_norm": 2.0362146685051843, + "language_loss": 0.81996644, + "learning_rate": 3.193645963899858e-06, + "loss": 0.84218907, + "num_input_tokens_seen": 56880625, + "step": 2641, + "time_per_iteration": 2.5452685356140137 + }, + { + "auxiliary_loss_clip": 0.01161364, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.05526114, + "balance_loss_mlp": 1.01926208, + "epoch": 0.3176817170684783, + "flos": 25481673267840.0, + "grad_norm": 1.7722952033375337, + "language_loss": 0.84076703, + "learning_rate": 3.193020847610479e-06, + "loss": 0.86266017, + "num_input_tokens_seen": 56900945, + "step": 2642, + "time_per_iteration": 2.5563437938690186 + }, + { + "auxiliary_loss_clip": 0.01163302, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.05836082, + "balance_loss_mlp": 1.02279413, + "epoch": 0.3178019599591174, + "flos": 24972603765120.0, + "grad_norm": 2.0340945530910575, + "language_loss": 0.71406412, + "learning_rate": 3.192395550340787e-06, + "loss": 0.73601568, + "num_input_tokens_seen": 56918895, + "step": 2643, + "time_per_iteration": 2.5983755588531494 + }, + { + "auxiliary_loss_clip": 0.01182187, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.05884242, + "balance_loss_mlp": 1.02141619, + "epoch": 0.31792220284975653, + "flos": 12422220019200.0, + "grad_norm": 1.852321342700682, + "language_loss": 0.76504725, + "learning_rate": 3.191770072185638e-06, + "loss": 0.78716767, + "num_input_tokens_seen": 56935890, + "step": 2644, + "time_per_iteration": 2.5009708404541016 + }, + { + "auxiliary_loss_clip": 0.01180588, + "auxiliary_loss_mlp": 0.0103713, + "balance_loss_clip": 1.05742192, + "balance_loss_mlp": 1.0278852, + "epoch": 0.3180424457403956, + "flos": 15485759089920.0, + "grad_norm": 2.395451684155793, + "language_loss": 0.72973293, + "learning_rate": 3.191144413239916e-06, + "loss": 0.75191009, + "num_input_tokens_seen": 56952460, + "step": 2645, + "time_per_iteration": 2.4313387870788574 + }, + { + "auxiliary_loss_clip": 0.01169339, + "auxiliary_loss_mlp": 0.01036704, + "balance_loss_clip": 1.05692148, + "balance_loss_mlp": 1.02751279, + "epoch": 0.3181626886310347, + "flos": 26174964648960.0, + "grad_norm": 1.983108536040906, + "language_loss": 0.88373291, + "learning_rate": 3.190518573598534e-06, + "loss": 0.90579337, + "num_input_tokens_seen": 56969065, + "step": 2646, + "time_per_iteration": 2.5359935760498047 + }, + { + "auxiliary_loss_clip": 0.01159165, + "auxiliary_loss_mlp": 0.01037176, + "balance_loss_clip": 1.05368567, + "balance_loss_mlp": 1.02776408, + "epoch": 0.3182829315216738, + "flos": 25483109811840.0, + "grad_norm": 2.122215686681643, + "language_loss": 0.77337086, + "learning_rate": 3.1898925533564308e-06, + "loss": 0.79533428, + "num_input_tokens_seen": 56990535, + "step": 2647, + "time_per_iteration": 2.6039280891418457 + }, + { + "auxiliary_loss_clip": 0.01142102, + "auxiliary_loss_mlp": 0.01034459, + "balance_loss_clip": 1.05286932, + "balance_loss_mlp": 1.02556622, + "epoch": 0.31840317441231286, + "flos": 18113701927680.0, + "grad_norm": 2.342769357604192, + "language_loss": 0.64212525, + "learning_rate": 3.1892663526085733e-06, + "loss": 0.66389084, + "num_input_tokens_seen": 57008910, + "step": 2648, + "time_per_iteration": 2.537564516067505 + }, + { + "auxiliary_loss_clip": 0.01088266, + "auxiliary_loss_mlp": 0.01004107, + "balance_loss_clip": 1.01677966, + "balance_loss_mlp": 1.00302792, + "epoch": 0.31852341730295197, + "flos": 64741948957440.0, + "grad_norm": 0.7484988669303206, + "language_loss": 0.56965578, + "learning_rate": 3.188639971449956e-06, + "loss": 0.59057951, + "num_input_tokens_seen": 57074960, + "step": 2649, + "time_per_iteration": 3.0121805667877197 + }, + { + "auxiliary_loss_clip": 0.01197483, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.05930972, + "balance_loss_mlp": 1.01973331, + "epoch": 0.318643660193591, + "flos": 20668135582080.0, + "grad_norm": 2.331412653515872, + "language_loss": 0.72185898, + "learning_rate": 3.1880134099756e-06, + "loss": 0.74411851, + "num_input_tokens_seen": 57094595, + "step": 2650, + "time_per_iteration": 2.5160577297210693 + }, + { + "auxiliary_loss_clip": 0.01177795, + "auxiliary_loss_mlp": 0.0102325, + "balance_loss_clip": 1.05430102, + "balance_loss_mlp": 1.01451826, + "epoch": 0.31876390308423014, + "flos": 26943345411840.0, + "grad_norm": 1.9460425718120204, + "language_loss": 0.69406295, + "learning_rate": 3.1873866682805535e-06, + "loss": 0.71607339, + "num_input_tokens_seen": 57115290, + "step": 2651, + "time_per_iteration": 2.5709779262542725 + }, + { + "auxiliary_loss_clip": 0.01171562, + "auxiliary_loss_mlp": 0.01033712, + "balance_loss_clip": 1.05688798, + "balance_loss_mlp": 1.02476549, + "epoch": 0.31888414597486925, + "flos": 18041916597120.0, + "grad_norm": 1.938450377907265, + "language_loss": 0.88781404, + "learning_rate": 3.186759746459894e-06, + "loss": 0.90986675, + "num_input_tokens_seen": 57134400, + "step": 2652, + "time_per_iteration": 2.4988715648651123 + }, + { + "auxiliary_loss_clip": 0.01167836, + "auxiliary_loss_mlp": 0.01028148, + "balance_loss_clip": 1.05722857, + "balance_loss_mlp": 1.01909363, + "epoch": 0.3190043888655083, + "flos": 25149319701120.0, + "grad_norm": 1.7924446837430996, + "language_loss": 0.79769063, + "learning_rate": 3.1861326446087246e-06, + "loss": 0.81965047, + "num_input_tokens_seen": 57153140, + "step": 2653, + "time_per_iteration": 2.556647777557373 + }, + { + "auxiliary_loss_clip": 0.01183329, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.05700707, + "balance_loss_mlp": 1.02064347, + "epoch": 0.3191246317561474, + "flos": 22053892331520.0, + "grad_norm": 2.52207124254031, + "language_loss": 0.71587068, + "learning_rate": 3.1855053628221763e-06, + "loss": 0.73800838, + "num_input_tokens_seen": 57172395, + "step": 2654, + "time_per_iteration": 2.5029003620147705 + }, + { + "auxiliary_loss_clip": 0.01143666, + "auxiliary_loss_mlp": 0.01030823, + "balance_loss_clip": 1.04982662, + "balance_loss_mlp": 1.0214119, + "epoch": 0.3192448746467865, + "flos": 14901815687040.0, + "grad_norm": 2.867701481127341, + "language_loss": 0.8983897, + "learning_rate": 3.184877901195407e-06, + "loss": 0.92013454, + "num_input_tokens_seen": 57189090, + "step": 2655, + "time_per_iteration": 2.5815958976745605 + }, + { + "auxiliary_loss_clip": 0.01074607, + "auxiliary_loss_mlp": 0.01010327, + "balance_loss_clip": 1.0312084, + "balance_loss_mlp": 1.00793076, + "epoch": 0.3193651175374256, + "flos": 67234832657280.0, + "grad_norm": 0.8000441545952708, + "language_loss": 0.62858403, + "learning_rate": 3.184250259823602e-06, + "loss": 0.64943337, + "num_input_tokens_seen": 57251620, + "step": 2656, + "time_per_iteration": 3.179706573486328 + }, + { + "auxiliary_loss_clip": 0.01151327, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.05284452, + "balance_loss_mlp": 1.02278924, + "epoch": 0.3194853604280647, + "flos": 12233077977600.0, + "grad_norm": 2.10505701184891, + "language_loss": 0.81961387, + "learning_rate": 3.183622438801974e-06, + "loss": 0.84145105, + "num_input_tokens_seen": 57266910, + "step": 2657, + "time_per_iteration": 3.364516496658325 + }, + { + "auxiliary_loss_clip": 0.01198138, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.06061089, + "balance_loss_mlp": 1.02889752, + "epoch": 0.3196056033187038, + "flos": 14939917038720.0, + "grad_norm": 2.0034519755168665, + "language_loss": 0.75626612, + "learning_rate": 3.1829944382257637e-06, + "loss": 0.77862084, + "num_input_tokens_seen": 57285040, + "step": 2658, + "time_per_iteration": 2.4232571125030518 + }, + { + "auxiliary_loss_clip": 0.01178779, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.05815458, + "balance_loss_mlp": 1.02107072, + "epoch": 0.31972584620934286, + "flos": 23768878164480.0, + "grad_norm": 2.5705549797136262, + "language_loss": 0.81624818, + "learning_rate": 3.1823662581902373e-06, + "loss": 0.83833075, + "num_input_tokens_seen": 57302725, + "step": 2659, + "time_per_iteration": 2.488865852355957 + }, + { + "auxiliary_loss_clip": 0.0113726, + "auxiliary_loss_mlp": 0.01028363, + "balance_loss_clip": 1.04705095, + "balance_loss_mlp": 1.01924896, + "epoch": 0.31984608909998197, + "flos": 21251540280960.0, + "grad_norm": 2.2266351166514236, + "language_loss": 0.74425817, + "learning_rate": 3.1817378987906896e-06, + "loss": 0.76591432, + "num_input_tokens_seen": 57322230, + "step": 2660, + "time_per_iteration": 2.53401255607605 + }, + { + "auxiliary_loss_clip": 0.0112966, + "auxiliary_loss_mlp": 0.01037263, + "balance_loss_clip": 1.05116057, + "balance_loss_mlp": 1.02820945, + "epoch": 0.3199663319906211, + "flos": 18296235866880.0, + "grad_norm": 1.987371293722694, + "language_loss": 0.79672003, + "learning_rate": 3.181109360122442e-06, + "loss": 0.8183893, + "num_input_tokens_seen": 57339820, + "step": 2661, + "time_per_iteration": 3.4290072917938232 + }, + { + "auxiliary_loss_clip": 0.01146868, + "auxiliary_loss_mlp": 0.01029228, + "balance_loss_clip": 1.05140936, + "balance_loss_mlp": 1.02025127, + "epoch": 0.32008657488126013, + "flos": 18733627779840.0, + "grad_norm": 2.425880746601867, + "language_loss": 0.78360391, + "learning_rate": 3.1804806422808445e-06, + "loss": 0.80536491, + "num_input_tokens_seen": 57356955, + "step": 2662, + "time_per_iteration": 4.198965787887573 + }, + { + "auxiliary_loss_clip": 0.01156041, + "auxiliary_loss_mlp": 0.01036098, + "balance_loss_clip": 1.05287266, + "balance_loss_mlp": 1.02707434, + "epoch": 0.32020681777189924, + "flos": 20595344670720.0, + "grad_norm": 1.7860179000476801, + "language_loss": 0.72918212, + "learning_rate": 3.1798517453612714e-06, + "loss": 0.75110352, + "num_input_tokens_seen": 57376760, + "step": 2663, + "time_per_iteration": 2.5562779903411865 + }, + { + "auxiliary_loss_clip": 0.01179521, + "auxiliary_loss_mlp": 0.01034504, + "balance_loss_clip": 1.06063056, + "balance_loss_mlp": 1.02580774, + "epoch": 0.32032706066253835, + "flos": 35261692750080.0, + "grad_norm": 1.773240893572529, + "language_loss": 0.74971962, + "learning_rate": 3.1792226694591265e-06, + "loss": 0.77185988, + "num_input_tokens_seen": 57398145, + "step": 2664, + "time_per_iteration": 2.658367156982422 + }, + { + "auxiliary_loss_clip": 0.0114839, + "auxiliary_loss_mlp": 0.01028116, + "balance_loss_clip": 1.0551343, + "balance_loss_mlp": 1.01968193, + "epoch": 0.3204473035531774, + "flos": 15304230731520.0, + "grad_norm": 2.03338043129447, + "language_loss": 0.80644727, + "learning_rate": 3.178593414669841e-06, + "loss": 0.82821238, + "num_input_tokens_seen": 57416730, + "step": 2665, + "time_per_iteration": 2.598355293273926 + }, + { + "auxiliary_loss_clip": 0.01183026, + "auxiliary_loss_mlp": 0.01026295, + "balance_loss_clip": 1.05799913, + "balance_loss_mlp": 1.01747918, + "epoch": 0.3205675464438165, + "flos": 24462564595200.0, + "grad_norm": 2.8005865739918296, + "language_loss": 0.7041229, + "learning_rate": 3.1779639810888707e-06, + "loss": 0.72621614, + "num_input_tokens_seen": 57436325, + "step": 2666, + "time_per_iteration": 2.521804094314575 + }, + { + "auxiliary_loss_clip": 0.01179358, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.05889463, + "balance_loss_mlp": 1.02268636, + "epoch": 0.3206877893344556, + "flos": 22456235548800.0, + "grad_norm": 1.786391339346807, + "language_loss": 0.75545996, + "learning_rate": 3.1773343688117013e-06, + "loss": 0.77757043, + "num_input_tokens_seen": 57457235, + "step": 2667, + "time_per_iteration": 2.5075602531433105 + }, + { + "auxiliary_loss_clip": 0.01169584, + "auxiliary_loss_mlp": 0.00763307, + "balance_loss_clip": 1.05522203, + "balance_loss_mlp": 1.00067472, + "epoch": 0.3208080322250947, + "flos": 20412236113920.0, + "grad_norm": 4.225036790900256, + "language_loss": 0.84068, + "learning_rate": 3.1767045779338445e-06, + "loss": 0.8600089, + "num_input_tokens_seen": 57474895, + "step": 2668, + "time_per_iteration": 2.5326688289642334 + }, + { + "auxiliary_loss_clip": 0.01176118, + "auxiliary_loss_mlp": 0.01026263, + "balance_loss_clip": 1.05298924, + "balance_loss_mlp": 1.01857424, + "epoch": 0.3209282751157338, + "flos": 21762118154880.0, + "grad_norm": 1.941818888435655, + "language_loss": 0.91274035, + "learning_rate": 3.176074608550839e-06, + "loss": 0.93476427, + "num_input_tokens_seen": 57490715, + "step": 2669, + "time_per_iteration": 2.5570433139801025 + }, + { + "auxiliary_loss_clip": 0.01123878, + "auxiliary_loss_mlp": 0.01034385, + "balance_loss_clip": 1.05132246, + "balance_loss_mlp": 1.02551031, + "epoch": 0.32104851800637285, + "flos": 22055041566720.0, + "grad_norm": 2.381875996007557, + "language_loss": 0.82460862, + "learning_rate": 3.17544446075825e-06, + "loss": 0.84619123, + "num_input_tokens_seen": 57509880, + "step": 2670, + "time_per_iteration": 2.651054859161377 + }, + { + "auxiliary_loss_clip": 0.01168614, + "auxiliary_loss_mlp": 0.0103203, + "balance_loss_clip": 1.05406141, + "balance_loss_mlp": 1.02391744, + "epoch": 0.32116876089701196, + "flos": 37012301896320.0, + "grad_norm": 1.597889973427214, + "language_loss": 0.70859063, + "learning_rate": 3.174814134651671e-06, + "loss": 0.73059708, + "num_input_tokens_seen": 57532430, + "step": 2671, + "time_per_iteration": 2.6753392219543457 + }, + { + "auxiliary_loss_clip": 0.0119022, + "auxiliary_loss_mlp": 0.01029587, + "balance_loss_clip": 1.05826557, + "balance_loss_mlp": 1.0209918, + "epoch": 0.3212890037876511, + "flos": 21979233912960.0, + "grad_norm": 1.9115989746728388, + "language_loss": 0.80422068, + "learning_rate": 3.1741836303267215e-06, + "loss": 0.82641876, + "num_input_tokens_seen": 57551965, + "step": 2672, + "time_per_iteration": 2.563977003097534 + }, + { + "auxiliary_loss_clip": 0.01192452, + "auxiliary_loss_mlp": 0.01029348, + "balance_loss_clip": 1.05839014, + "balance_loss_mlp": 1.02112865, + "epoch": 0.32140924667829013, + "flos": 10342345875840.0, + "grad_norm": 1.8069210179413981, + "language_loss": 0.74914384, + "learning_rate": 3.1735529478790496e-06, + "loss": 0.77136183, + "num_input_tokens_seen": 57569955, + "step": 2673, + "time_per_iteration": 2.4567229747772217 + }, + { + "auxiliary_loss_clip": 0.01181318, + "auxiliary_loss_mlp": 0.01036346, + "balance_loss_clip": 1.05734229, + "balance_loss_mlp": 1.02689862, + "epoch": 0.32152948956892924, + "flos": 50798910072960.0, + "grad_norm": 2.121830759928693, + "language_loss": 0.79461741, + "learning_rate": 3.172922087404328e-06, + "loss": 0.81679416, + "num_input_tokens_seen": 57592215, + "step": 2674, + "time_per_iteration": 2.7686643600463867 + }, + { + "auxiliary_loss_clip": 0.01091215, + "auxiliary_loss_mlp": 0.01002056, + "balance_loss_clip": 1.02087653, + "balance_loss_mlp": 1.00110841, + "epoch": 0.32164973245956835, + "flos": 63863250549120.0, + "grad_norm": 0.7683724866515861, + "language_loss": 0.55282241, + "learning_rate": 3.1722910489982586e-06, + "loss": 0.57375509, + "num_input_tokens_seen": 57652575, + "step": 2675, + "time_per_iteration": 3.1243398189544678 + }, + { + "auxiliary_loss_clip": 0.01159543, + "auxiliary_loss_mlp": 0.01033774, + "balance_loss_clip": 1.05268073, + "balance_loss_mlp": 1.02466655, + "epoch": 0.3217699753502074, + "flos": 23513948363520.0, + "grad_norm": 1.482625143549675, + "language_loss": 0.80013525, + "learning_rate": 3.1716598327565694e-06, + "loss": 0.82206845, + "num_input_tokens_seen": 57672215, + "step": 2676, + "time_per_iteration": 2.5619418621063232 + }, + { + "auxiliary_loss_clip": 0.01190618, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.05724406, + "balance_loss_mlp": 1.02173281, + "epoch": 0.3218902182408465, + "flos": 19062533640960.0, + "grad_norm": 1.5643073483380618, + "language_loss": 0.84217495, + "learning_rate": 3.171028438775015e-06, + "loss": 0.86438751, + "num_input_tokens_seen": 57691410, + "step": 2677, + "time_per_iteration": 2.4965391159057617 + }, + { + "auxiliary_loss_clip": 0.01192187, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.05769992, + "balance_loss_mlp": 1.02070999, + "epoch": 0.3220104611314856, + "flos": 20375571306240.0, + "grad_norm": 1.858680983702436, + "language_loss": 0.84208, + "learning_rate": 3.170396867149377e-06, + "loss": 0.86429185, + "num_input_tokens_seen": 57709415, + "step": 2678, + "time_per_iteration": 2.4523134231567383 + }, + { + "auxiliary_loss_clip": 0.01130078, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.05200458, + "balance_loss_mlp": 1.02751482, + "epoch": 0.3221307040221247, + "flos": 20117014231680.0, + "grad_norm": 1.7938712894813198, + "language_loss": 0.86117458, + "learning_rate": 3.1697651179754653e-06, + "loss": 0.88283873, + "num_input_tokens_seen": 57728075, + "step": 2679, + "time_per_iteration": 2.612335205078125 + }, + { + "auxiliary_loss_clip": 0.01152921, + "auxiliary_loss_mlp": 0.01031333, + "balance_loss_clip": 1.05902421, + "balance_loss_mlp": 1.02272058, + "epoch": 0.3222509469127638, + "flos": 23987789602560.0, + "grad_norm": 1.782525146767077, + "language_loss": 0.73276806, + "learning_rate": 3.1691331913491153e-06, + "loss": 0.75461054, + "num_input_tokens_seen": 57750645, + "step": 2680, + "time_per_iteration": 2.628188371658325 + }, + { + "auxiliary_loss_clip": 0.0119234, + "auxiliary_loss_mlp": 0.01027515, + "balance_loss_clip": 1.05584121, + "balance_loss_mlp": 1.01950371, + "epoch": 0.32237118980340285, + "flos": 17675735397120.0, + "grad_norm": 1.990361221118503, + "language_loss": 0.84645855, + "learning_rate": 3.1685010873661898e-06, + "loss": 0.86865705, + "num_input_tokens_seen": 57769820, + "step": 2681, + "time_per_iteration": 2.4477040767669678 + }, + { + "auxiliary_loss_clip": 0.01177423, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.05705523, + "balance_loss_mlp": 1.0229274, + "epoch": 0.32249143269404196, + "flos": 23147982645120.0, + "grad_norm": 1.9073689939583216, + "language_loss": 0.79787964, + "learning_rate": 3.167868806122578e-06, + "loss": 0.81997758, + "num_input_tokens_seen": 57788870, + "step": 2682, + "time_per_iteration": 2.5127296447753906 + }, + { + "auxiliary_loss_clip": 0.01170678, + "auxiliary_loss_mlp": 0.010305, + "balance_loss_clip": 1.0571593, + "balance_loss_mlp": 1.02176201, + "epoch": 0.32261167558468107, + "flos": 24422308427520.0, + "grad_norm": 1.7893641358590515, + "language_loss": 0.66334766, + "learning_rate": 3.1672363477141968e-06, + "loss": 0.68535942, + "num_input_tokens_seen": 57808165, + "step": 2683, + "time_per_iteration": 3.403834581375122 + }, + { + "auxiliary_loss_clip": 0.01167707, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.05328894, + "balance_loss_mlp": 1.02533042, + "epoch": 0.3227319184753201, + "flos": 30367175852160.0, + "grad_norm": 1.7531527269139193, + "language_loss": 0.85104823, + "learning_rate": 3.1666037122369903e-06, + "loss": 0.87306839, + "num_input_tokens_seen": 57828825, + "step": 2684, + "time_per_iteration": 2.5829105377197266 + }, + { + "auxiliary_loss_clip": 0.01177012, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.05405569, + "balance_loss_mlp": 1.02074575, + "epoch": 0.32285216136595923, + "flos": 16946174257920.0, + "grad_norm": 2.0704149156554, + "language_loss": 0.86442816, + "learning_rate": 3.165970899786928e-06, + "loss": 0.8864876, + "num_input_tokens_seen": 57846740, + "step": 2685, + "time_per_iteration": 2.5022048950195312 + }, + { + "auxiliary_loss_clip": 0.01154311, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.05366087, + "balance_loss_mlp": 1.02008939, + "epoch": 0.32297240425659834, + "flos": 21981532383360.0, + "grad_norm": 1.6916691925977374, + "language_loss": 0.75412464, + "learning_rate": 3.1653379104600067e-06, + "loss": 0.77595121, + "num_input_tokens_seen": 57866885, + "step": 2686, + "time_per_iteration": 2.5646004676818848 + }, + { + "auxiliary_loss_clip": 0.01176373, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.05480671, + "balance_loss_mlp": 1.02155828, + "epoch": 0.3230926471472374, + "flos": 22748045639040.0, + "grad_norm": 2.331889449696419, + "language_loss": 0.69665074, + "learning_rate": 3.164704744352251e-06, + "loss": 0.7187137, + "num_input_tokens_seen": 57887690, + "step": 2687, + "time_per_iteration": 2.5375418663024902 + }, + { + "auxiliary_loss_clip": 0.011763, + "auxiliary_loss_mlp": 0.01032012, + "balance_loss_clip": 1.05488431, + "balance_loss_mlp": 1.02422166, + "epoch": 0.3232128900378765, + "flos": 16942977947520.0, + "grad_norm": 2.105842305246496, + "language_loss": 0.80590993, + "learning_rate": 3.164071401559713e-06, + "loss": 0.82799304, + "num_input_tokens_seen": 57905090, + "step": 2688, + "time_per_iteration": 3.3633596897125244 + }, + { + "auxiliary_loss_clip": 0.01164644, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.054299, + "balance_loss_mlp": 1.02410626, + "epoch": 0.3233331329285156, + "flos": 24023736138240.0, + "grad_norm": 1.6717343602788888, + "language_loss": 0.70919359, + "learning_rate": 3.1634378821784674e-06, + "loss": 0.73116487, + "num_input_tokens_seen": 57925305, + "step": 2689, + "time_per_iteration": 4.2222888469696045 + }, + { + "auxiliary_loss_clip": 0.01153321, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.05504024, + "balance_loss_mlp": 1.02609301, + "epoch": 0.3234533758191547, + "flos": 18113845582080.0, + "grad_norm": 2.4492995973323644, + "language_loss": 0.74481857, + "learning_rate": 3.1628041863046208e-06, + "loss": 0.76669633, + "num_input_tokens_seen": 57942720, + "step": 2690, + "time_per_iteration": 2.558166265487671 + }, + { + "auxiliary_loss_clip": 0.01196038, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.05574393, + "balance_loss_mlp": 1.02316892, + "epoch": 0.3235736187097938, + "flos": 16946138344320.0, + "grad_norm": 2.21581339529008, + "language_loss": 0.90908998, + "learning_rate": 3.162170314034304e-06, + "loss": 0.93137413, + "num_input_tokens_seen": 57960135, + "step": 2691, + "time_per_iteration": 2.4545295238494873 + }, + { + "auxiliary_loss_clip": 0.01194613, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.05597401, + "balance_loss_mlp": 1.02337503, + "epoch": 0.3236938616004329, + "flos": 22127150119680.0, + "grad_norm": 1.69881504924168, + "language_loss": 0.80864364, + "learning_rate": 3.1615362654636738e-06, + "loss": 0.83091563, + "num_input_tokens_seen": 57980875, + "step": 2692, + "time_per_iteration": 2.4704015254974365 + }, + { + "auxiliary_loss_clip": 0.01143242, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.05474329, + "balance_loss_mlp": 1.02554703, + "epoch": 0.32381410449107195, + "flos": 17164618819200.0, + "grad_norm": 1.6989526200457512, + "language_loss": 0.87463701, + "learning_rate": 3.1609020406889163e-06, + "loss": 0.89640915, + "num_input_tokens_seen": 57998310, + "step": 2693, + "time_per_iteration": 2.5160059928894043 + }, + { + "auxiliary_loss_clip": 0.01167696, + "auxiliary_loss_mlp": 0.01033677, + "balance_loss_clip": 1.05462456, + "balance_loss_mlp": 1.02396131, + "epoch": 0.32393434738171106, + "flos": 16578125550720.0, + "grad_norm": 1.6470579250626114, + "language_loss": 0.85137844, + "learning_rate": 3.1602676398062416e-06, + "loss": 0.87339216, + "num_input_tokens_seen": 58017220, + "step": 2694, + "time_per_iteration": 2.51727294921875 + }, + { + "auxiliary_loss_clip": 0.0117627, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.05586851, + "balance_loss_mlp": 1.0191462, + "epoch": 0.3240545902723502, + "flos": 25483612602240.0, + "grad_norm": 2.051156024763937, + "language_loss": 0.61586684, + "learning_rate": 3.1596330629118886e-06, + "loss": 0.63791084, + "num_input_tokens_seen": 58037190, + "step": 2695, + "time_per_iteration": 2.571971893310547 + }, + { + "auxiliary_loss_clip": 0.01131357, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.0513432, + "balance_loss_mlp": 1.02285552, + "epoch": 0.32417483316298923, + "flos": 35845851634560.0, + "grad_norm": 1.9871282258802616, + "language_loss": 0.73243213, + "learning_rate": 3.1589983101021223e-06, + "loss": 0.75406164, + "num_input_tokens_seen": 58055820, + "step": 2696, + "time_per_iteration": 2.725529909133911 + }, + { + "auxiliary_loss_clip": 0.01165154, + "auxiliary_loss_mlp": 0.01028393, + "balance_loss_clip": 1.05454111, + "balance_loss_mlp": 1.01994681, + "epoch": 0.32429507605362834, + "flos": 30080501406720.0, + "grad_norm": 2.3291311033144093, + "language_loss": 0.85074127, + "learning_rate": 3.1583633814732337e-06, + "loss": 0.87267673, + "num_input_tokens_seen": 58075340, + "step": 2697, + "time_per_iteration": 2.5923664569854736 + }, + { + "auxiliary_loss_clip": 0.01190134, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.05493593, + "balance_loss_mlp": 1.02710152, + "epoch": 0.3244153189442674, + "flos": 18223265387520.0, + "grad_norm": 2.7287229313480172, + "language_loss": 0.71703166, + "learning_rate": 3.157728277121541e-06, + "loss": 0.73929197, + "num_input_tokens_seen": 58093515, + "step": 2698, + "time_per_iteration": 2.4554076194763184 + }, + { + "auxiliary_loss_clip": 0.01190914, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.05290318, + "balance_loss_mlp": 1.01988959, + "epoch": 0.3245355618349065, + "flos": 17710317216000.0, + "grad_norm": 2.557176730795796, + "language_loss": 0.78560084, + "learning_rate": 3.1570929971433897e-06, + "loss": 0.80779684, + "num_input_tokens_seen": 58109300, + "step": 2699, + "time_per_iteration": 2.4922051429748535 + }, + { + "auxiliary_loss_clip": 0.01178222, + "auxiliary_loss_mlp": 0.01034697, + "balance_loss_clip": 1.05738342, + "balance_loss_mlp": 1.02638793, + "epoch": 0.3246558047255456, + "flos": 23440798316160.0, + "grad_norm": 2.3486320779730523, + "language_loss": 0.83945686, + "learning_rate": 3.1564575416351504e-06, + "loss": 0.86158609, + "num_input_tokens_seen": 58128000, + "step": 2700, + "time_per_iteration": 2.5409977436065674 + }, + { + "auxiliary_loss_clip": 0.01192635, + "auxiliary_loss_mlp": 0.01028964, + "balance_loss_clip": 1.05661488, + "balance_loss_mlp": 1.01995742, + "epoch": 0.32477604761618467, + "flos": 21760861178880.0, + "grad_norm": 1.8382055656498018, + "language_loss": 0.74573141, + "learning_rate": 3.155821910693221e-06, + "loss": 0.76794744, + "num_input_tokens_seen": 58147415, + "step": 2701, + "time_per_iteration": 2.4598922729492188 + }, + { + "auxiliary_loss_clip": 0.01161604, + "auxiliary_loss_mlp": 0.01030281, + "balance_loss_clip": 1.05148828, + "balance_loss_mlp": 1.02149582, + "epoch": 0.3248962905068238, + "flos": 19828328624640.0, + "grad_norm": 1.77561343765119, + "language_loss": 0.85860306, + "learning_rate": 3.1551861044140275e-06, + "loss": 0.88052189, + "num_input_tokens_seen": 58167050, + "step": 2702, + "time_per_iteration": 2.545163869857788 + }, + { + "auxiliary_loss_clip": 0.01130465, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.0518229, + "balance_loss_mlp": 1.0226537, + "epoch": 0.3250165333974629, + "flos": 23948215793280.0, + "grad_norm": 2.518944028727769, + "language_loss": 0.77937633, + "learning_rate": 3.15455012289402e-06, + "loss": 0.80099034, + "num_input_tokens_seen": 58186695, + "step": 2703, + "time_per_iteration": 2.6312096118927 + }, + { + "auxiliary_loss_clip": 0.01179474, + "auxiliary_loss_mlp": 0.01029946, + "balance_loss_clip": 1.05674911, + "balance_loss_mlp": 1.02092755, + "epoch": 0.32513677628810195, + "flos": 23989333887360.0, + "grad_norm": 1.8105436482824648, + "language_loss": 0.84446788, + "learning_rate": 3.153913966229677e-06, + "loss": 0.86656213, + "num_input_tokens_seen": 58205815, + "step": 2704, + "time_per_iteration": 2.520097255706787 + }, + { + "auxiliary_loss_clip": 0.01082358, + "auxiliary_loss_mlp": 0.01004924, + "balance_loss_clip": 1.02118278, + "balance_loss_mlp": 1.00370789, + "epoch": 0.32525701917874106, + "flos": 70655790009600.0, + "grad_norm": 0.6400157666945211, + "language_loss": 0.50297797, + "learning_rate": 3.1532776345175027e-06, + "loss": 0.5238508, + "num_input_tokens_seen": 58270960, + "step": 2705, + "time_per_iteration": 3.094642162322998 + }, + { + "auxiliary_loss_clip": 0.01190474, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.05572164, + "balance_loss_mlp": 1.02524841, + "epoch": 0.32537726206938017, + "flos": 19682639061120.0, + "grad_norm": 1.9398741628678617, + "language_loss": 0.78877556, + "learning_rate": 3.1526411278540285e-06, + "loss": 0.81102294, + "num_input_tokens_seen": 58289390, + "step": 2706, + "time_per_iteration": 2.4604310989379883 + }, + { + "auxiliary_loss_clip": 0.01170815, + "auxiliary_loss_mlp": 0.01033581, + "balance_loss_clip": 1.05296147, + "balance_loss_mlp": 1.02412748, + "epoch": 0.3254975049600192, + "flos": 28760999293440.0, + "grad_norm": 2.6649028613281573, + "language_loss": 0.8103748, + "learning_rate": 3.1520044463358116e-06, + "loss": 0.83241868, + "num_input_tokens_seen": 58306120, + "step": 2707, + "time_per_iteration": 2.578207015991211 + }, + { + "auxiliary_loss_clip": 0.01177063, + "auxiliary_loss_mlp": 0.01028943, + "balance_loss_clip": 1.05536747, + "balance_loss_mlp": 1.01990354, + "epoch": 0.32561774785065833, + "flos": 18877378008960.0, + "grad_norm": 1.5542523578386402, + "language_loss": 0.80113983, + "learning_rate": 3.151367590059436e-06, + "loss": 0.82319987, + "num_input_tokens_seen": 58324545, + "step": 2708, + "time_per_iteration": 2.4830994606018066 + }, + { + "auxiliary_loss_clip": 0.01193839, + "auxiliary_loss_mlp": 0.00763984, + "balance_loss_clip": 1.05644178, + "balance_loss_mlp": 1.00061584, + "epoch": 0.32573799074129745, + "flos": 23112107936640.0, + "grad_norm": 1.9158896819042956, + "language_loss": 0.86814183, + "learning_rate": 3.1507305591215117e-06, + "loss": 0.88772005, + "num_input_tokens_seen": 58342455, + "step": 2709, + "time_per_iteration": 2.5095303058624268 + }, + { + "auxiliary_loss_clip": 0.01081599, + "auxiliary_loss_mlp": 0.01000387, + "balance_loss_clip": 1.02072513, + "balance_loss_mlp": 0.99936146, + "epoch": 0.3258582336319365, + "flos": 71237650423680.0, + "grad_norm": 0.6710497388181448, + "language_loss": 0.55728269, + "learning_rate": 3.150093353618677e-06, + "loss": 0.57810253, + "num_input_tokens_seen": 58407185, + "step": 2710, + "time_per_iteration": 3.9900460243225098 + }, + { + "auxiliary_loss_clip": 0.01179995, + "auxiliary_loss_mlp": 0.01028208, + "balance_loss_clip": 1.05258071, + "balance_loss_mlp": 1.01936841, + "epoch": 0.3259784765225756, + "flos": 22456020067200.0, + "grad_norm": 2.3641429077897764, + "language_loss": 0.88119578, + "learning_rate": 3.149455973647596e-06, + "loss": 0.90327775, + "num_input_tokens_seen": 58425245, + "step": 2711, + "time_per_iteration": 2.496067762374878 + }, + { + "auxiliary_loss_clip": 0.01139049, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.04591322, + "balance_loss_mlp": 1.01880252, + "epoch": 0.32609871941321467, + "flos": 20484811543680.0, + "grad_norm": 1.7644762406713717, + "language_loss": 0.77162302, + "learning_rate": 3.1488184193049563e-06, + "loss": 0.79329693, + "num_input_tokens_seen": 58444780, + "step": 2712, + "time_per_iteration": 2.5653631687164307 + }, + { + "auxiliary_loss_clip": 0.0119313, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.05860734, + "balance_loss_mlp": 1.02093983, + "epoch": 0.3262189623038538, + "flos": 22416805393920.0, + "grad_norm": 1.5234617118566582, + "language_loss": 0.72071731, + "learning_rate": 3.1481806906874767e-06, + "loss": 0.74294233, + "num_input_tokens_seen": 58466090, + "step": 2713, + "time_per_iteration": 2.537545680999756 + }, + { + "auxiliary_loss_clip": 0.01191152, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.05639124, + "balance_loss_mlp": 1.02097738, + "epoch": 0.3263392051944929, + "flos": 20923496346240.0, + "grad_norm": 1.5222323815950405, + "language_loss": 0.87659419, + "learning_rate": 3.147542787891899e-06, + "loss": 0.89879698, + "num_input_tokens_seen": 58485435, + "step": 2714, + "time_per_iteration": 2.4770402908325195 + }, + { + "auxiliary_loss_clip": 0.01165721, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.05646634, + "balance_loss_mlp": 1.02497625, + "epoch": 0.32645944808513194, + "flos": 24025172682240.0, + "grad_norm": 2.226836444538743, + "language_loss": 0.75280476, + "learning_rate": 3.1469047110149926e-06, + "loss": 0.77480483, + "num_input_tokens_seen": 58504175, + "step": 2715, + "time_per_iteration": 3.446967363357544 + }, + { + "auxiliary_loss_clip": 0.01131137, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.05160069, + "balance_loss_mlp": 1.02020264, + "epoch": 0.32657969097577105, + "flos": 21032413361280.0, + "grad_norm": 1.9436120964076988, + "language_loss": 0.85275984, + "learning_rate": 3.146266460153554e-06, + "loss": 0.87435687, + "num_input_tokens_seen": 58523885, + "step": 2716, + "time_per_iteration": 3.363856077194214 + }, + { + "auxiliary_loss_clip": 0.01162522, + "auxiliary_loss_mlp": 0.00763834, + "balance_loss_clip": 1.05383217, + "balance_loss_mlp": 1.00063705, + "epoch": 0.32669993386641016, + "flos": 22710267509760.0, + "grad_norm": 1.8365836094478587, + "language_loss": 0.80179358, + "learning_rate": 3.145628035404404e-06, + "loss": 0.82105708, + "num_input_tokens_seen": 58543085, + "step": 2717, + "time_per_iteration": 2.5573678016662598 + }, + { + "auxiliary_loss_clip": 0.01079784, + "auxiliary_loss_mlp": 0.01004353, + "balance_loss_clip": 1.02022147, + "balance_loss_mlp": 1.00322008, + "epoch": 0.3268201767570492, + "flos": 72105718406400.0, + "grad_norm": 0.8800989363533821, + "language_loss": 0.57526404, + "learning_rate": 3.1449894368643922e-06, + "loss": 0.5961054, + "num_input_tokens_seen": 58605400, + "step": 2718, + "time_per_iteration": 3.1740405559539795 + }, + { + "auxiliary_loss_clip": 0.01149127, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.05382383, + "balance_loss_mlp": 1.02353776, + "epoch": 0.32694041964768833, + "flos": 24535175938560.0, + "grad_norm": 1.4842212095007867, + "language_loss": 0.71396226, + "learning_rate": 3.1443506646303934e-06, + "loss": 0.73577595, + "num_input_tokens_seen": 58626700, + "step": 2719, + "time_per_iteration": 2.5880401134490967 + }, + { + "auxiliary_loss_clip": 0.01181125, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.05421257, + "balance_loss_mlp": 1.01957762, + "epoch": 0.32706066253832744, + "flos": 33183003755520.0, + "grad_norm": 3.096262231864864, + "language_loss": 0.66281915, + "learning_rate": 3.1437117187993086e-06, + "loss": 0.68491626, + "num_input_tokens_seen": 58649020, + "step": 2720, + "time_per_iteration": 2.5920658111572266 + }, + { + "auxiliary_loss_clip": 0.01142924, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.04939818, + "balance_loss_mlp": 1.0276618, + "epoch": 0.3271809054289665, + "flos": 24061622008320.0, + "grad_norm": 1.6352889588221644, + "language_loss": 0.79710084, + "learning_rate": 3.143072599468065e-06, + "loss": 0.81889319, + "num_input_tokens_seen": 58668845, + "step": 2721, + "time_per_iteration": 2.5829741954803467 + }, + { + "auxiliary_loss_clip": 0.01162481, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.05459476, + "balance_loss_mlp": 1.01813388, + "epoch": 0.3273011483196056, + "flos": 38253769712640.0, + "grad_norm": 1.5041987902718992, + "language_loss": 0.75658202, + "learning_rate": 3.1424333067336174e-06, + "loss": 0.77847242, + "num_input_tokens_seen": 58691610, + "step": 2722, + "time_per_iteration": 2.6655757427215576 + }, + { + "auxiliary_loss_clip": 0.01182258, + "auxiliary_loss_mlp": 0.01034431, + "balance_loss_clip": 1.05412769, + "balance_loss_mlp": 1.02504921, + "epoch": 0.3274213912102447, + "flos": 29054389582080.0, + "grad_norm": 1.7790416177480768, + "language_loss": 0.78194147, + "learning_rate": 3.141793840692945e-06, + "loss": 0.80410838, + "num_input_tokens_seen": 58712360, + "step": 2723, + "time_per_iteration": 2.5465664863586426 + }, + { + "auxiliary_loss_clip": 0.01155013, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.05115497, + "balance_loss_mlp": 1.02207315, + "epoch": 0.32754163410088377, + "flos": 29133249891840.0, + "grad_norm": 2.141415948292631, + "language_loss": 0.61650538, + "learning_rate": 3.1411542014430553e-06, + "loss": 0.63837183, + "num_input_tokens_seen": 58733440, + "step": 2724, + "time_per_iteration": 2.591707706451416 + }, + { + "auxiliary_loss_clip": 0.01145698, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.04682374, + "balance_loss_mlp": 1.02284694, + "epoch": 0.3276618769915229, + "flos": 20631075724800.0, + "grad_norm": 1.6898622287564375, + "language_loss": 0.8172431, + "learning_rate": 3.1405143890809804e-06, + "loss": 0.83901304, + "num_input_tokens_seen": 58752735, + "step": 2725, + "time_per_iteration": 2.615417003631592 + }, + { + "auxiliary_loss_clip": 0.01161647, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.05358028, + "balance_loss_mlp": 1.01898193, + "epoch": 0.327782119882162, + "flos": 18657425076480.0, + "grad_norm": 1.7748133468683849, + "language_loss": 0.69959855, + "learning_rate": 3.1398744037037796e-06, + "loss": 0.72148514, + "num_input_tokens_seen": 58772070, + "step": 2726, + "time_per_iteration": 2.5282788276672363 + }, + { + "auxiliary_loss_clip": 0.01162873, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_clip": 1.05430639, + "balance_loss_mlp": 1.02123523, + "epoch": 0.32790236277280105, + "flos": 21795802133760.0, + "grad_norm": 3.5058638702400886, + "language_loss": 0.84428805, + "learning_rate": 3.139234245408538e-06, + "loss": 0.86621118, + "num_input_tokens_seen": 58790950, + "step": 2727, + "time_per_iteration": 2.5232949256896973 + }, + { + "auxiliary_loss_clip": 0.01150785, + "auxiliary_loss_mlp": 0.00763251, + "balance_loss_clip": 1.0541718, + "balance_loss_mlp": 1.0006001, + "epoch": 0.32802260566344016, + "flos": 23331414424320.0, + "grad_norm": 1.4027983021834594, + "language_loss": 0.76069951, + "learning_rate": 3.1385939142923666e-06, + "loss": 0.77983987, + "num_input_tokens_seen": 58813340, + "step": 2728, + "time_per_iteration": 2.567537784576416 + }, + { + "auxiliary_loss_clip": 0.01164984, + "auxiliary_loss_mlp": 0.01032246, + "balance_loss_clip": 1.05145645, + "balance_loss_mlp": 1.02325797, + "epoch": 0.3281428485540792, + "flos": 24206988349440.0, + "grad_norm": 1.961203247980162, + "language_loss": 0.78408444, + "learning_rate": 3.137953410452405e-06, + "loss": 0.80605674, + "num_input_tokens_seen": 58833610, + "step": 2729, + "time_per_iteration": 2.545344829559326 + }, + { + "auxiliary_loss_clip": 0.01158294, + "auxiliary_loss_mlp": 0.01033674, + "balance_loss_clip": 1.0496335, + "balance_loss_mlp": 1.02546036, + "epoch": 0.3282630914447183, + "flos": 34128962380800.0, + "grad_norm": 1.6421596659617923, + "language_loss": 0.74445462, + "learning_rate": 3.1373127339858146e-06, + "loss": 0.76637435, + "num_input_tokens_seen": 58856210, + "step": 2730, + "time_per_iteration": 2.644753932952881 + }, + { + "auxiliary_loss_clip": 0.01142428, + "auxiliary_loss_mlp": 0.01027631, + "balance_loss_clip": 1.04760504, + "balance_loss_mlp": 1.02037692, + "epoch": 0.32838333433535744, + "flos": 27600726170880.0, + "grad_norm": 1.8103094504922241, + "language_loss": 0.74423331, + "learning_rate": 3.136671884989787e-06, + "loss": 0.76593387, + "num_input_tokens_seen": 58876120, + "step": 2731, + "time_per_iteration": 2.620925188064575 + }, + { + "auxiliary_loss_clip": 0.0112396, + "auxiliary_loss_mlp": 0.01029609, + "balance_loss_clip": 1.04842675, + "balance_loss_mlp": 1.02072811, + "epoch": 0.3285035772259965, + "flos": 12349500935040.0, + "grad_norm": 2.1177205652376565, + "language_loss": 0.87565303, + "learning_rate": 3.1360308635615383e-06, + "loss": 0.89718866, + "num_input_tokens_seen": 58894660, + "step": 2732, + "time_per_iteration": 2.6295647621154785 + }, + { + "auxiliary_loss_clip": 0.0116973, + "auxiliary_loss_mlp": 0.0102941, + "balance_loss_clip": 1.05275822, + "balance_loss_mlp": 1.01976609, + "epoch": 0.3286238201166356, + "flos": 24316084932480.0, + "grad_norm": 1.9301541430485614, + "language_loss": 0.78520918, + "learning_rate": 3.135389669798311e-06, + "loss": 0.80720055, + "num_input_tokens_seen": 58912720, + "step": 2733, + "time_per_iteration": 2.5923287868499756 + }, + { + "auxiliary_loss_clip": 0.0117521, + "auxiliary_loss_mlp": 0.00763201, + "balance_loss_clip": 1.05306721, + "balance_loss_mlp": 1.00057244, + "epoch": 0.3287440630072747, + "flos": 21392812471680.0, + "grad_norm": 1.7195785694770358, + "language_loss": 0.80045044, + "learning_rate": 3.134748303797373e-06, + "loss": 0.81983453, + "num_input_tokens_seen": 58930090, + "step": 2734, + "time_per_iteration": 2.5224335193634033 + }, + { + "auxiliary_loss_clip": 0.01136794, + "auxiliary_loss_mlp": 0.01032068, + "balance_loss_clip": 1.04885435, + "balance_loss_mlp": 1.02257895, + "epoch": 0.32886430589791377, + "flos": 23732536579200.0, + "grad_norm": 1.789089162321881, + "language_loss": 0.81015676, + "learning_rate": 3.1341067656560203e-06, + "loss": 0.8318454, + "num_input_tokens_seen": 58947935, + "step": 2735, + "time_per_iteration": 2.609121084213257 + }, + { + "auxiliary_loss_clip": 0.01173713, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.05412519, + "balance_loss_mlp": 1.02098203, + "epoch": 0.3289845487885529, + "flos": 22418708814720.0, + "grad_norm": 2.005109041925455, + "language_loss": 0.86447185, + "learning_rate": 3.133465055471572e-06, + "loss": 0.88650972, + "num_input_tokens_seen": 58967720, + "step": 2736, + "time_per_iteration": 3.4520626068115234 + }, + { + "auxiliary_loss_clip": 0.01143588, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.04840648, + "balance_loss_mlp": 1.01983643, + "epoch": 0.329104791679192, + "flos": 19682603147520.0, + "grad_norm": 3.9249019448475315, + "language_loss": 0.66175836, + "learning_rate": 3.1328231733413767e-06, + "loss": 0.68347776, + "num_input_tokens_seen": 58984360, + "step": 2737, + "time_per_iteration": 2.5393660068511963 + }, + { + "auxiliary_loss_clip": 0.0117341, + "auxiliary_loss_mlp": 0.01035398, + "balance_loss_clip": 1.0530076, + "balance_loss_mlp": 1.02602851, + "epoch": 0.32922503456983104, + "flos": 15997234803840.0, + "grad_norm": 2.517977310543383, + "language_loss": 0.91078639, + "learning_rate": 3.1321811193628067e-06, + "loss": 0.93287444, + "num_input_tokens_seen": 59002505, + "step": 2738, + "time_per_iteration": 2.5558552742004395 + }, + { + "auxiliary_loss_clip": 0.01180166, + "auxiliary_loss_mlp": 0.00764131, + "balance_loss_clip": 1.0562675, + "balance_loss_mlp": 1.00049877, + "epoch": 0.32934527746047015, + "flos": 26834069260800.0, + "grad_norm": 1.692209889409344, + "language_loss": 0.69912708, + "learning_rate": 3.131538893633261e-06, + "loss": 0.71856999, + "num_input_tokens_seen": 59022065, + "step": 2739, + "time_per_iteration": 2.58483624458313 + }, + { + "auxiliary_loss_clip": 0.01193664, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.05687737, + "balance_loss_mlp": 1.02320099, + "epoch": 0.32946552035110926, + "flos": 23403774372480.0, + "grad_norm": 2.0316138901416703, + "language_loss": 0.78344131, + "learning_rate": 3.130896496250165e-06, + "loss": 0.80569446, + "num_input_tokens_seen": 59041890, + "step": 2740, + "time_per_iteration": 2.502199411392212 + }, + { + "auxiliary_loss_clip": 0.01193416, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.05590367, + "balance_loss_mlp": 1.02155089, + "epoch": 0.3295857632417483, + "flos": 14172470029440.0, + "grad_norm": 2.057239755727636, + "language_loss": 0.86492032, + "learning_rate": 3.1302539273109693e-06, + "loss": 0.88715571, + "num_input_tokens_seen": 59058715, + "step": 2741, + "time_per_iteration": 4.060149669647217 + }, + { + "auxiliary_loss_clip": 0.01158753, + "auxiliary_loss_mlp": 0.01034995, + "balance_loss_clip": 1.05486703, + "balance_loss_mlp": 1.02543414, + "epoch": 0.32970600613238743, + "flos": 22196708807040.0, + "grad_norm": 1.6136790967051624, + "language_loss": 0.803855, + "learning_rate": 3.1296111869131513e-06, + "loss": 0.82579249, + "num_input_tokens_seen": 59076140, + "step": 2742, + "time_per_iteration": 3.3554325103759766 + }, + { + "auxiliary_loss_clip": 0.01192164, + "auxiliary_loss_mlp": 0.01030759, + "balance_loss_clip": 1.05575359, + "balance_loss_mlp": 1.02224731, + "epoch": 0.32982624902302654, + "flos": 22053784590720.0, + "grad_norm": 1.8611195951837967, + "language_loss": 0.85763323, + "learning_rate": 3.1289682751542153e-06, + "loss": 0.87986243, + "num_input_tokens_seen": 59095700, + "step": 2743, + "time_per_iteration": 2.464463233947754 + }, + { + "auxiliary_loss_clip": 0.01176162, + "auxiliary_loss_mlp": 0.0102958, + "balance_loss_clip": 1.05371094, + "balance_loss_mlp": 1.020872, + "epoch": 0.3299464919136656, + "flos": 18661626967680.0, + "grad_norm": 1.9409555574641155, + "language_loss": 0.71338344, + "learning_rate": 3.1283251921316883e-06, + "loss": 0.73544085, + "num_input_tokens_seen": 59113445, + "step": 2744, + "time_per_iteration": 2.4867451190948486 + }, + { + "auxiliary_loss_clip": 0.01134698, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.05118132, + "balance_loss_mlp": 1.0244832, + "epoch": 0.3300667348043047, + "flos": 13407357404160.0, + "grad_norm": 2.913798876066495, + "language_loss": 0.81023258, + "learning_rate": 3.1276819379431277e-06, + "loss": 0.83192277, + "num_input_tokens_seen": 59131535, + "step": 2745, + "time_per_iteration": 2.5624399185180664 + }, + { + "auxiliary_loss_clip": 0.01173215, + "auxiliary_loss_mlp": 0.00764189, + "balance_loss_clip": 1.05453289, + "balance_loss_mlp": 1.00064063, + "epoch": 0.33018697769494376, + "flos": 15742556398080.0, + "grad_norm": 2.0936198590434225, + "language_loss": 0.75473702, + "learning_rate": 3.1270385126861134e-06, + "loss": 0.77411103, + "num_input_tokens_seen": 59149520, + "step": 2746, + "time_per_iteration": 2.4873788356781006 + }, + { + "auxiliary_loss_clip": 0.01195702, + "auxiliary_loss_mlp": 0.01034259, + "balance_loss_clip": 1.0569973, + "balance_loss_mlp": 1.02432871, + "epoch": 0.3303072205855829, + "flos": 18258601392000.0, + "grad_norm": 1.8831606611305816, + "language_loss": 0.82025903, + "learning_rate": 3.1263949164582533e-06, + "loss": 0.84255868, + "num_input_tokens_seen": 59169170, + "step": 2747, + "time_per_iteration": 2.4668898582458496 + }, + { + "auxiliary_loss_clip": 0.01192521, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.05309093, + "balance_loss_mlp": 1.02266765, + "epoch": 0.330427463476222, + "flos": 17749424148480.0, + "grad_norm": 4.539365216926317, + "language_loss": 0.78308821, + "learning_rate": 3.1257511493571797e-06, + "loss": 0.80532861, + "num_input_tokens_seen": 59187675, + "step": 2748, + "time_per_iteration": 2.4170444011688232 + }, + { + "auxiliary_loss_clip": 0.01150153, + "auxiliary_loss_mlp": 0.01031558, + "balance_loss_clip": 1.05056405, + "balance_loss_mlp": 1.02291489, + "epoch": 0.33054770636686104, + "flos": 27162580072320.0, + "grad_norm": 1.9490683824747084, + "language_loss": 0.78467047, + "learning_rate": 3.125107211480552e-06, + "loss": 0.80648762, + "num_input_tokens_seen": 59207610, + "step": 2749, + "time_per_iteration": 2.579108476638794 + }, + { + "auxiliary_loss_clip": 0.01119104, + "auxiliary_loss_mlp": 0.01037455, + "balance_loss_clip": 1.04829955, + "balance_loss_mlp": 1.02853203, + "epoch": 0.33066794925750015, + "flos": 20117193799680.0, + "grad_norm": 1.7075292249442584, + "language_loss": 0.79815769, + "learning_rate": 3.124463102926054e-06, + "loss": 0.81972325, + "num_input_tokens_seen": 59226945, + "step": 2750, + "time_per_iteration": 2.5984702110290527 + }, + { + "auxiliary_loss_clip": 0.0107862, + "auxiliary_loss_mlp": 0.01002748, + "balance_loss_clip": 1.02483416, + "balance_loss_mlp": 1.00163329, + "epoch": 0.33078819214813926, + "flos": 70642609718400.0, + "grad_norm": 0.7729465978551084, + "language_loss": 0.6163817, + "learning_rate": 3.1238188237913984e-06, + "loss": 0.63719541, + "num_input_tokens_seen": 59291485, + "step": 2751, + "time_per_iteration": 3.1397552490234375 + }, + { + "auxiliary_loss_clip": 0.01199678, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.05940342, + "balance_loss_mlp": 1.02455831, + "epoch": 0.3309084350387783, + "flos": 21141940907520.0, + "grad_norm": 2.5169710047634193, + "language_loss": 0.76210451, + "learning_rate": 3.1231743741743202e-06, + "loss": 0.78444135, + "num_input_tokens_seen": 59310990, + "step": 2752, + "time_per_iteration": 2.4696764945983887 + }, + { + "auxiliary_loss_clip": 0.01172715, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.05092049, + "balance_loss_mlp": 1.02516294, + "epoch": 0.3310286779294174, + "flos": 14209350318720.0, + "grad_norm": 2.4352076894350305, + "language_loss": 0.83444971, + "learning_rate": 3.122529754172582e-06, + "loss": 0.85651577, + "num_input_tokens_seen": 59327875, + "step": 2753, + "time_per_iteration": 2.48481822013855 + }, + { + "auxiliary_loss_clip": 0.0118076, + "auxiliary_loss_mlp": 0.01035805, + "balance_loss_clip": 1.05689394, + "balance_loss_mlp": 1.02652478, + "epoch": 0.33114892082005654, + "flos": 20778130005120.0, + "grad_norm": 1.9102499010941272, + "language_loss": 0.73010021, + "learning_rate": 3.1218849638839736e-06, + "loss": 0.75226587, + "num_input_tokens_seen": 59347135, + "step": 2754, + "time_per_iteration": 2.5567264556884766 + }, + { + "auxiliary_loss_clip": 0.01137548, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.04534566, + "balance_loss_mlp": 1.0262078, + "epoch": 0.3312691637106956, + "flos": 17090750499840.0, + "grad_norm": 1.6547585653169383, + "language_loss": 0.785092, + "learning_rate": 3.121240003406307e-06, + "loss": 0.80682856, + "num_input_tokens_seen": 59365985, + "step": 2755, + "time_per_iteration": 2.542545795440674 + }, + { + "auxiliary_loss_clip": 0.01153884, + "auxiliary_loss_mlp": 0.01031601, + "balance_loss_clip": 1.05309296, + "balance_loss_mlp": 1.02196908, + "epoch": 0.3313894066013347, + "flos": 29456230008960.0, + "grad_norm": 2.5690401948586534, + "language_loss": 0.72341871, + "learning_rate": 3.120594872837425e-06, + "loss": 0.74527359, + "num_input_tokens_seen": 59384655, + "step": 2756, + "time_per_iteration": 2.600107192993164 + }, + { + "auxiliary_loss_clip": 0.01078267, + "auxiliary_loss_mlp": 0.00754039, + "balance_loss_clip": 1.02038193, + "balance_loss_mlp": 1.00154066, + "epoch": 0.3315096494919738, + "flos": 61419242280960.0, + "grad_norm": 0.8304241367334306, + "language_loss": 0.62385881, + "learning_rate": 3.1199495722751906e-06, + "loss": 0.64218187, + "num_input_tokens_seen": 59444185, + "step": 2757, + "time_per_iteration": 3.1084578037261963 + }, + { + "auxiliary_loss_clip": 0.0113728, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.04860616, + "balance_loss_mlp": 1.02584529, + "epoch": 0.33162989238261287, + "flos": 21653057485440.0, + "grad_norm": 1.5867036401819143, + "language_loss": 0.84134471, + "learning_rate": 3.1193041018174972e-06, + "loss": 0.86307156, + "num_input_tokens_seen": 59464900, + "step": 2758, + "time_per_iteration": 2.5904479026794434 + }, + { + "auxiliary_loss_clip": 0.01182796, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.05649698, + "balance_loss_mlp": 1.01961744, + "epoch": 0.331750135273252, + "flos": 22674787850880.0, + "grad_norm": 2.518464440665844, + "language_loss": 0.94788033, + "learning_rate": 3.118658461562261e-06, + "loss": 0.96999556, + "num_input_tokens_seen": 59481000, + "step": 2759, + "time_per_iteration": 2.5496444702148438 + }, + { + "auxiliary_loss_clip": 0.0116562, + "auxiliary_loss_mlp": 0.01035404, + "balance_loss_clip": 1.05583405, + "balance_loss_mlp": 1.02552152, + "epoch": 0.33187037816389103, + "flos": 22746896403840.0, + "grad_norm": 1.6533177678466706, + "language_loss": 0.8507998, + "learning_rate": 3.118012651607426e-06, + "loss": 0.87281001, + "num_input_tokens_seen": 59502605, + "step": 2760, + "time_per_iteration": 2.5263097286224365 + }, + { + "auxiliary_loss_clip": 0.01193611, + "auxiliary_loss_mlp": 0.01037843, + "balance_loss_clip": 1.05622172, + "balance_loss_mlp": 1.02765107, + "epoch": 0.33199062105453014, + "flos": 19203769918080.0, + "grad_norm": 2.4254441292340485, + "language_loss": 0.83478636, + "learning_rate": 3.1173666720509603e-06, + "loss": 0.85710084, + "num_input_tokens_seen": 59519540, + "step": 2761, + "time_per_iteration": 2.4219202995300293 + }, + { + "auxiliary_loss_clip": 0.01168581, + "auxiliary_loss_mlp": 0.01030734, + "balance_loss_clip": 1.05312145, + "balance_loss_mlp": 1.02172756, + "epoch": 0.33211086394516925, + "flos": 31577006764800.0, + "grad_norm": 1.6923913410383216, + "language_loss": 0.68148553, + "learning_rate": 3.116720522990859e-06, + "loss": 0.70347857, + "num_input_tokens_seen": 59540415, + "step": 2762, + "time_per_iteration": 2.5745129585266113 + }, + { + "auxiliary_loss_clip": 0.01125152, + "auxiliary_loss_mlp": 0.01033436, + "balance_loss_clip": 1.05001163, + "balance_loss_mlp": 1.02435267, + "epoch": 0.3322311068358083, + "flos": 17932496791680.0, + "grad_norm": 1.9306088334356015, + "language_loss": 0.61995107, + "learning_rate": 3.116074204525142e-06, + "loss": 0.64153695, + "num_input_tokens_seen": 59558590, + "step": 2763, + "time_per_iteration": 3.3974854946136475 + }, + { + "auxiliary_loss_clip": 0.0117086, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.05436826, + "balance_loss_mlp": 1.02089024, + "epoch": 0.3323513497264474, + "flos": 32269831269120.0, + "grad_norm": 1.4331940432253774, + "language_loss": 0.83622736, + "learning_rate": 3.1154277167518553e-06, + "loss": 0.85822821, + "num_input_tokens_seen": 59580205, + "step": 2764, + "time_per_iteration": 2.5662119388580322 + }, + { + "auxiliary_loss_clip": 0.01062631, + "auxiliary_loss_mlp": 0.01001813, + "balance_loss_clip": 1.01838255, + "balance_loss_mlp": 1.00066233, + "epoch": 0.33247159261708653, + "flos": 52668674588160.0, + "grad_norm": 0.7804633477763853, + "language_loss": 0.59487081, + "learning_rate": 3.114781059769072e-06, + "loss": 0.61551523, + "num_input_tokens_seen": 59631530, + "step": 2765, + "time_per_iteration": 2.976964235305786 + }, + { + "auxiliary_loss_clip": 0.01162952, + "auxiliary_loss_mlp": 0.01030335, + "balance_loss_clip": 1.05310428, + "balance_loss_mlp": 1.02079248, + "epoch": 0.3325918355077256, + "flos": 27125232906240.0, + "grad_norm": 2.743449929994885, + "language_loss": 0.67059427, + "learning_rate": 3.1141342336748874e-06, + "loss": 0.69252712, + "num_input_tokens_seen": 59651090, + "step": 2766, + "time_per_iteration": 2.6151795387268066 + }, + { + "auxiliary_loss_clip": 0.01177667, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.05664277, + "balance_loss_mlp": 1.02500534, + "epoch": 0.3327120783983647, + "flos": 23664414435840.0, + "grad_norm": 1.4137262903912304, + "language_loss": 0.82222188, + "learning_rate": 3.1134872385674253e-06, + "loss": 0.84433889, + "num_input_tokens_seen": 59675245, + "step": 2767, + "time_per_iteration": 2.5766637325286865 + }, + { + "auxiliary_loss_clip": 0.01167436, + "auxiliary_loss_mlp": 0.01034197, + "balance_loss_clip": 1.05121827, + "balance_loss_mlp": 1.02471995, + "epoch": 0.3328323212890038, + "flos": 19171378828800.0, + "grad_norm": 1.6515907177798275, + "language_loss": 0.85343361, + "learning_rate": 3.1128400745448353e-06, + "loss": 0.8754499, + "num_input_tokens_seen": 59694625, + "step": 2768, + "time_per_iteration": 3.3697237968444824 + }, + { + "auxiliary_loss_clip": 0.01180835, + "auxiliary_loss_mlp": 0.01032246, + "balance_loss_clip": 1.05591393, + "balance_loss_mlp": 1.02310896, + "epoch": 0.33295256417964286, + "flos": 37706347463040.0, + "grad_norm": 2.169925744566654, + "language_loss": 0.62822759, + "learning_rate": 3.11219274170529e-06, + "loss": 0.65035838, + "num_input_tokens_seen": 59716435, + "step": 2769, + "time_per_iteration": 3.335397720336914 + }, + { + "auxiliary_loss_clip": 0.01157557, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.05101037, + "balance_loss_mlp": 1.02562094, + "epoch": 0.333072807070282, + "flos": 26505989412480.0, + "grad_norm": 14.509614954692156, + "language_loss": 0.81849802, + "learning_rate": 3.1115452401469903e-06, + "loss": 0.84041399, + "num_input_tokens_seen": 59736835, + "step": 2770, + "time_per_iteration": 2.556903123855591 + }, + { + "auxiliary_loss_clip": 0.01125532, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.04597116, + "balance_loss_mlp": 1.02481604, + "epoch": 0.3331930499609211, + "flos": 21430913823360.0, + "grad_norm": 2.1390266964790836, + "language_loss": 0.86334103, + "learning_rate": 3.1108975699681613e-06, + "loss": 0.88493425, + "num_input_tokens_seen": 59754230, + "step": 2771, + "time_per_iteration": 2.5740745067596436 + }, + { + "auxiliary_loss_clip": 0.01147929, + "auxiliary_loss_mlp": 0.01036992, + "balance_loss_clip": 1.05150056, + "balance_loss_mlp": 1.02792573, + "epoch": 0.33331329285156014, + "flos": 20659947281280.0, + "grad_norm": 1.874550957165198, + "language_loss": 0.71346569, + "learning_rate": 3.1102497312670542e-06, + "loss": 0.73531485, + "num_input_tokens_seen": 59772235, + "step": 2772, + "time_per_iteration": 2.539773464202881 + }, + { + "auxiliary_loss_clip": 0.01153109, + "auxiliary_loss_mlp": 0.01038073, + "balance_loss_clip": 1.05132282, + "balance_loss_mlp": 1.02885818, + "epoch": 0.33343353574219925, + "flos": 28001596930560.0, + "grad_norm": 2.8438995920134076, + "language_loss": 0.8049711, + "learning_rate": 3.109601724141946e-06, + "loss": 0.8268829, + "num_input_tokens_seen": 59791230, + "step": 2773, + "time_per_iteration": 2.547266721725464 + }, + { + "auxiliary_loss_clip": 0.01161656, + "auxiliary_loss_mlp": 0.01029194, + "balance_loss_clip": 1.05396748, + "balance_loss_mlp": 1.02053893, + "epoch": 0.33355377863283836, + "flos": 23764963582080.0, + "grad_norm": 1.7184250157778536, + "language_loss": 0.68006015, + "learning_rate": 3.108953548691138e-06, + "loss": 0.70196861, + "num_input_tokens_seen": 59811315, + "step": 2774, + "time_per_iteration": 2.53174090385437 + }, + { + "auxiliary_loss_clip": 0.01195884, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.05773067, + "balance_loss_mlp": 1.02047515, + "epoch": 0.3336740215234774, + "flos": 37779677078400.0, + "grad_norm": 2.1293331143453287, + "language_loss": 0.72533524, + "learning_rate": 3.108305205012959e-06, + "loss": 0.74759054, + "num_input_tokens_seen": 59832010, + "step": 2775, + "time_per_iteration": 2.579028844833374 + }, + { + "auxiliary_loss_clip": 0.01161332, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.05292702, + "balance_loss_mlp": 1.02032089, + "epoch": 0.3337942644141165, + "flos": 25519056347520.0, + "grad_norm": 2.3667259888482577, + "language_loss": 0.87442344, + "learning_rate": 3.107656693205761e-06, + "loss": 0.89633477, + "num_input_tokens_seen": 59851450, + "step": 2776, + "time_per_iteration": 2.5330100059509277 + }, + { + "auxiliary_loss_clip": 0.01197016, + "auxiliary_loss_mlp": 0.01035707, + "balance_loss_clip": 1.05699849, + "balance_loss_mlp": 1.0258956, + "epoch": 0.3339145073047556, + "flos": 25989844930560.0, + "grad_norm": 2.9084244473816225, + "language_loss": 0.70411718, + "learning_rate": 3.107008013367924e-06, + "loss": 0.72644442, + "num_input_tokens_seen": 59870245, + "step": 2777, + "time_per_iteration": 2.4657881259918213 + }, + { + "auxiliary_loss_clip": 0.01148842, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.05172324, + "balance_loss_mlp": 1.02202642, + "epoch": 0.3340347501953947, + "flos": 19062569554560.0, + "grad_norm": 2.0964109295983344, + "language_loss": 0.86988473, + "learning_rate": 3.1063591655978507e-06, + "loss": 0.89168835, + "num_input_tokens_seen": 59886195, + "step": 2778, + "time_per_iteration": 2.5176336765289307 + }, + { + "auxiliary_loss_clip": 0.01122109, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.0455277, + "balance_loss_mlp": 1.02304578, + "epoch": 0.3341549930860338, + "flos": 18109715518080.0, + "grad_norm": 1.728711784970176, + "language_loss": 0.79506189, + "learning_rate": 3.105710149993972e-06, + "loss": 0.81660682, + "num_input_tokens_seen": 59905525, + "step": 2779, + "time_per_iteration": 2.5531671047210693 + }, + { + "auxiliary_loss_clip": 0.01196644, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.0569284, + "balance_loss_mlp": 1.02102017, + "epoch": 0.33427523597667286, + "flos": 22674967418880.0, + "grad_norm": 1.7292401895339276, + "language_loss": 0.85241753, + "learning_rate": 3.1050609666547427e-06, + "loss": 0.87468749, + "num_input_tokens_seen": 59925085, + "step": 2780, + "time_per_iteration": 2.4527883529663086 + }, + { + "auxiliary_loss_clip": 0.01158706, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.05448353, + "balance_loss_mlp": 1.03050768, + "epoch": 0.33439547886731197, + "flos": 22638338524800.0, + "grad_norm": 1.8283661288281738, + "language_loss": 0.77297407, + "learning_rate": 3.104411615678644e-06, + "loss": 0.79495561, + "num_input_tokens_seen": 59943935, + "step": 2781, + "time_per_iteration": 2.5432119369506836 + }, + { + "auxiliary_loss_clip": 0.0116099, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.05300725, + "balance_loss_mlp": 1.02479994, + "epoch": 0.3345157217579511, + "flos": 24096383395200.0, + "grad_norm": 4.294028980692174, + "language_loss": 0.73754764, + "learning_rate": 3.1037620971641803e-06, + "loss": 0.75950515, + "num_input_tokens_seen": 59963725, + "step": 2782, + "time_per_iteration": 2.5091447830200195 + }, + { + "auxiliary_loss_clip": 0.0119616, + "auxiliary_loss_mlp": 0.01036294, + "balance_loss_clip": 1.05814433, + "balance_loss_mlp": 1.02685869, + "epoch": 0.33463596464859013, + "flos": 18989491334400.0, + "grad_norm": 2.1889981021101548, + "language_loss": 0.64687735, + "learning_rate": 3.1031124112098844e-06, + "loss": 0.66920185, + "num_input_tokens_seen": 59981935, + "step": 2783, + "time_per_iteration": 2.4288344383239746 + }, + { + "auxiliary_loss_clip": 0.0116814, + "auxiliary_loss_mlp": 0.01026885, + "balance_loss_clip": 1.05526125, + "balance_loss_mlp": 1.01790881, + "epoch": 0.33475620753922924, + "flos": 20375607219840.0, + "grad_norm": 2.059659066138216, + "language_loss": 0.72374785, + "learning_rate": 3.1024625579143127e-06, + "loss": 0.74569809, + "num_input_tokens_seen": 59999455, + "step": 2784, + "time_per_iteration": 2.489598035812378 + }, + { + "auxiliary_loss_clip": 0.01193086, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.05673635, + "balance_loss_mlp": 1.02617896, + "epoch": 0.33487645042986836, + "flos": 18182578256640.0, + "grad_norm": 1.808290904659665, + "language_loss": 0.73041236, + "learning_rate": 3.101812537376048e-06, + "loss": 0.75269663, + "num_input_tokens_seen": 60018475, + "step": 2785, + "time_per_iteration": 2.443201780319214 + }, + { + "auxiliary_loss_clip": 0.01158049, + "auxiliary_loss_mlp": 0.00763856, + "balance_loss_clip": 1.05207539, + "balance_loss_mlp": 1.00059342, + "epoch": 0.3349966933205074, + "flos": 25848824135040.0, + "grad_norm": 1.9729312251774827, + "language_loss": 0.84468871, + "learning_rate": 3.1011623496936973e-06, + "loss": 0.86390775, + "num_input_tokens_seen": 60036770, + "step": 2786, + "time_per_iteration": 2.5408260822296143 + }, + { + "auxiliary_loss_clip": 0.01193375, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.05780411, + "balance_loss_mlp": 1.02285278, + "epoch": 0.3351169362111465, + "flos": 28111447699200.0, + "grad_norm": 1.7474483151917581, + "language_loss": 0.6976797, + "learning_rate": 3.100511994965893e-06, + "loss": 0.71993011, + "num_input_tokens_seen": 60056725, + "step": 2787, + "time_per_iteration": 2.5296106338500977 + }, + { + "auxiliary_loss_clip": 0.01174502, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.05610752, + "balance_loss_mlp": 1.02318156, + "epoch": 0.33523717910178563, + "flos": 22673315393280.0, + "grad_norm": 1.7022789766044617, + "language_loss": 0.84408545, + "learning_rate": 3.0998614732912947e-06, + "loss": 0.86615229, + "num_input_tokens_seen": 60076100, + "step": 2788, + "time_per_iteration": 2.4613378047943115 + }, + { + "auxiliary_loss_clip": 0.01181942, + "auxiliary_loss_mlp": 0.01034897, + "balance_loss_clip": 1.05838513, + "balance_loss_mlp": 1.02564025, + "epoch": 0.3353574219924247, + "flos": 15669801400320.0, + "grad_norm": 2.4432268464487645, + "language_loss": 0.67808342, + "learning_rate": 3.0992107847685855e-06, + "loss": 0.70025182, + "num_input_tokens_seen": 60093815, + "step": 2789, + "time_per_iteration": 3.2646031379699707 + }, + { + "auxiliary_loss_clip": 0.01171927, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.05956745, + "balance_loss_mlp": 1.0304513, + "epoch": 0.3354776648830638, + "flos": 24790644443520.0, + "grad_norm": 1.5583806863305008, + "language_loss": 0.79321051, + "learning_rate": 3.0985599294964736e-06, + "loss": 0.81532985, + "num_input_tokens_seen": 60113370, + "step": 2790, + "time_per_iteration": 2.546794891357422 + }, + { + "auxiliary_loss_clip": 0.01162839, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.05497122, + "balance_loss_mlp": 1.02819812, + "epoch": 0.33559790777370285, + "flos": 28694852398080.0, + "grad_norm": 2.081542223193698, + "language_loss": 0.70130706, + "learning_rate": 3.097908907573695e-06, + "loss": 0.72332573, + "num_input_tokens_seen": 60131350, + "step": 2791, + "time_per_iteration": 2.5969111919403076 + }, + { + "auxiliary_loss_clip": 0.01124099, + "auxiliary_loss_mlp": 0.01037581, + "balance_loss_clip": 1.05260944, + "balance_loss_mlp": 1.02872729, + "epoch": 0.33571815066434196, + "flos": 22235779825920.0, + "grad_norm": 1.9882523961761898, + "language_loss": 0.89591956, + "learning_rate": 3.0972577190990067e-06, + "loss": 0.91753638, + "num_input_tokens_seen": 60149830, + "step": 2792, + "time_per_iteration": 2.596595525741577 + }, + { + "auxiliary_loss_clip": 0.01156373, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.0543313, + "balance_loss_mlp": 1.02497685, + "epoch": 0.3358383935549811, + "flos": 23842279607040.0, + "grad_norm": 1.7828988341915757, + "language_loss": 0.80069876, + "learning_rate": 3.096606364171196e-06, + "loss": 0.82260156, + "num_input_tokens_seen": 60169620, + "step": 2793, + "time_per_iteration": 2.5602777004241943 + }, + { + "auxiliary_loss_clip": 0.01135225, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.04930151, + "balance_loss_mlp": 1.02370477, + "epoch": 0.33595863644562013, + "flos": 22267308988800.0, + "grad_norm": 2.3224747717232224, + "language_loss": 0.85101867, + "learning_rate": 3.0959548428890703e-06, + "loss": 0.87270457, + "num_input_tokens_seen": 60188490, + "step": 2794, + "time_per_iteration": 2.544663429260254 + }, + { + "auxiliary_loss_clip": 0.01177984, + "auxiliary_loss_mlp": 0.01039454, + "balance_loss_clip": 1.05831838, + "balance_loss_mlp": 1.03011346, + "epoch": 0.33607887933625924, + "flos": 20119779578880.0, + "grad_norm": 1.5196336030087647, + "language_loss": 0.84115517, + "learning_rate": 3.095303155351468e-06, + "loss": 0.86332953, + "num_input_tokens_seen": 60208695, + "step": 2795, + "time_per_iteration": 3.3538596630096436 + }, + { + "auxiliary_loss_clip": 0.01132007, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.05377054, + "balance_loss_mlp": 1.02703881, + "epoch": 0.33619912222689835, + "flos": 19318109886720.0, + "grad_norm": 2.175559617833444, + "language_loss": 0.79453677, + "learning_rate": 3.0946513016572464e-06, + "loss": 0.81621623, + "num_input_tokens_seen": 60227600, + "step": 2796, + "time_per_iteration": 3.410482883453369 + }, + { + "auxiliary_loss_clip": 0.01184565, + "auxiliary_loss_mlp": 0.01033796, + "balance_loss_clip": 1.05628538, + "balance_loss_mlp": 1.02425289, + "epoch": 0.3363193651175374, + "flos": 16800664262400.0, + "grad_norm": 3.020137812732992, + "language_loss": 0.76981485, + "learning_rate": 3.0939992819052938e-06, + "loss": 0.79199839, + "num_input_tokens_seen": 60245110, + "step": 2797, + "time_per_iteration": 2.4818058013916016 + }, + { + "auxiliary_loss_clip": 0.01169443, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.0585146, + "balance_loss_mlp": 1.02157736, + "epoch": 0.3364396080081765, + "flos": 23550289948800.0, + "grad_norm": 2.0811658074619106, + "language_loss": 0.81283718, + "learning_rate": 3.0933470961945193e-06, + "loss": 0.83484143, + "num_input_tokens_seen": 60263405, + "step": 2798, + "time_per_iteration": 2.548593521118164 + }, + { + "auxiliary_loss_clip": 0.01164715, + "auxiliary_loss_mlp": 0.01036986, + "balance_loss_clip": 1.05844295, + "balance_loss_mlp": 1.02821207, + "epoch": 0.3365598508988156, + "flos": 28037902602240.0, + "grad_norm": 1.5933206515666067, + "language_loss": 0.68220854, + "learning_rate": 3.0926947446238597e-06, + "loss": 0.70422554, + "num_input_tokens_seen": 60282975, + "step": 2799, + "time_per_iteration": 2.603602886199951 + }, + { + "auxiliary_loss_clip": 0.01183793, + "auxiliary_loss_mlp": 0.01035064, + "balance_loss_clip": 1.05471015, + "balance_loss_mlp": 1.0252049, + "epoch": 0.3366800937894547, + "flos": 16982767238400.0, + "grad_norm": 2.39436592149514, + "language_loss": 0.82461476, + "learning_rate": 3.092042227292276e-06, + "loss": 0.84680331, + "num_input_tokens_seen": 60299810, + "step": 2800, + "time_per_iteration": 2.5505878925323486 + }, + { + "auxiliary_loss_clip": 0.01191723, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.05854976, + "balance_loss_mlp": 1.02246976, + "epoch": 0.3368003366800938, + "flos": 23915321913600.0, + "grad_norm": 1.5730277935919512, + "language_loss": 0.88006878, + "learning_rate": 3.0913895442987557e-06, + "loss": 0.90229547, + "num_input_tokens_seen": 60320775, + "step": 2801, + "time_per_iteration": 2.4913220405578613 + }, + { + "auxiliary_loss_clip": 0.01153269, + "auxiliary_loss_mlp": 0.00764059, + "balance_loss_clip": 1.05524945, + "balance_loss_mlp": 1.00052059, + "epoch": 0.3369205795707329, + "flos": 24791219061120.0, + "grad_norm": 1.5405413196578854, + "language_loss": 0.86292446, + "learning_rate": 3.090736695742308e-06, + "loss": 0.88209772, + "num_input_tokens_seen": 60341905, + "step": 2802, + "time_per_iteration": 2.5626161098480225 + }, + { + "auxiliary_loss_clip": 0.01131308, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.05089545, + "balance_loss_mlp": 1.02376688, + "epoch": 0.33704082246137196, + "flos": 17931096161280.0, + "grad_norm": 2.2596717581977095, + "language_loss": 0.52499092, + "learning_rate": 3.0900836817219713e-06, + "loss": 0.5466271, + "num_input_tokens_seen": 60358335, + "step": 2803, + "time_per_iteration": 2.552642583847046 + }, + { + "auxiliary_loss_clip": 0.01192919, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.05703592, + "balance_loss_mlp": 1.02365518, + "epoch": 0.33716106535201107, + "flos": 21286517149440.0, + "grad_norm": 1.6339236632469183, + "language_loss": 0.83554125, + "learning_rate": 3.089430502336807e-06, + "loss": 0.85779238, + "num_input_tokens_seen": 60378305, + "step": 2804, + "time_per_iteration": 2.4783382415771484 + }, + { + "auxiliary_loss_clip": 0.01183403, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.05715764, + "balance_loss_mlp": 1.02036202, + "epoch": 0.3372813082426502, + "flos": 18402962152320.0, + "grad_norm": 3.299616317513142, + "language_loss": 0.89962888, + "learning_rate": 3.088777157685902e-06, + "loss": 0.92176002, + "num_input_tokens_seen": 60393895, + "step": 2805, + "time_per_iteration": 2.476597547531128 + }, + { + "auxiliary_loss_clip": 0.01162708, + "auxiliary_loss_mlp": 0.01027235, + "balance_loss_clip": 1.05594015, + "balance_loss_mlp": 1.01829433, + "epoch": 0.33740155113328923, + "flos": 17201391367680.0, + "grad_norm": 1.8817690805679947, + "language_loss": 0.85918957, + "learning_rate": 3.088123647868367e-06, + "loss": 0.88108909, + "num_input_tokens_seen": 60410445, + "step": 2806, + "time_per_iteration": 2.489069700241089 + }, + { + "auxiliary_loss_clip": 0.01184795, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.05695033, + "balance_loss_mlp": 1.02505589, + "epoch": 0.33752179402392835, + "flos": 29058950609280.0, + "grad_norm": 1.841258936596911, + "language_loss": 0.81294703, + "learning_rate": 3.0874699729833405e-06, + "loss": 0.83513343, + "num_input_tokens_seen": 60431815, + "step": 2807, + "time_per_iteration": 2.571648120880127 + }, + { + "auxiliary_loss_clip": 0.01162052, + "auxiliary_loss_mlp": 0.01027991, + "balance_loss_clip": 1.05567408, + "balance_loss_mlp": 1.01874042, + "epoch": 0.3376420369145674, + "flos": 25080730680960.0, + "grad_norm": 1.5942560599041764, + "language_loss": 0.79916126, + "learning_rate": 3.086816133129983e-06, + "loss": 0.82106173, + "num_input_tokens_seen": 60452075, + "step": 2808, + "time_per_iteration": 2.542128801345825 + }, + { + "auxiliary_loss_clip": 0.01195746, + "auxiliary_loss_mlp": 0.01028453, + "balance_loss_clip": 1.06124699, + "balance_loss_mlp": 1.01963139, + "epoch": 0.3377622798052065, + "flos": 27490624007040.0, + "grad_norm": 1.6209068587423892, + "language_loss": 0.76131868, + "learning_rate": 3.0861621284074826e-06, + "loss": 0.78356069, + "num_input_tokens_seen": 60472600, + "step": 2809, + "time_per_iteration": 2.5211522579193115 + }, + { + "auxiliary_loss_clip": 0.01173977, + "auxiliary_loss_mlp": 0.010321, + "balance_loss_clip": 1.05811119, + "balance_loss_mlp": 1.02384448, + "epoch": 0.3378825226958456, + "flos": 21975211589760.0, + "grad_norm": 1.4633976848118593, + "language_loss": 0.73133975, + "learning_rate": 3.085507958915051e-06, + "loss": 0.75340056, + "num_input_tokens_seen": 60491030, + "step": 2810, + "time_per_iteration": 2.4954710006713867 + }, + { + "auxiliary_loss_clip": 0.0116423, + "auxiliary_loss_mlp": 0.01032624, + "balance_loss_clip": 1.05806351, + "balance_loss_mlp": 1.02296162, + "epoch": 0.3380027655864847, + "flos": 42523189200000.0, + "grad_norm": 1.9593642710409174, + "language_loss": 0.71038061, + "learning_rate": 3.084853624751925e-06, + "loss": 0.73234916, + "num_input_tokens_seen": 60512615, + "step": 2811, + "time_per_iteration": 2.677703857421875 + }, + { + "auxiliary_loss_clip": 0.01156172, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.05782747, + "balance_loss_mlp": 1.02499461, + "epoch": 0.3381230084771238, + "flos": 26725080418560.0, + "grad_norm": 1.6767827979958827, + "language_loss": 0.85907954, + "learning_rate": 3.0841991260173668e-06, + "loss": 0.88098156, + "num_input_tokens_seen": 60532520, + "step": 2812, + "time_per_iteration": 2.593587875366211 + }, + { + "auxiliary_loss_clip": 0.01198465, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.0609982, + "balance_loss_mlp": 1.02039814, + "epoch": 0.3382432513677629, + "flos": 22710375250560.0, + "grad_norm": 1.826504349550865, + "language_loss": 0.80315924, + "learning_rate": 3.0835444628106634e-06, + "loss": 0.82544374, + "num_input_tokens_seen": 60551500, + "step": 2813, + "time_per_iteration": 2.4562697410583496 + }, + { + "auxiliary_loss_clip": 0.01193363, + "auxiliary_loss_mlp": 0.00764117, + "balance_loss_clip": 1.058496, + "balance_loss_mlp": 1.00055718, + "epoch": 0.33836349425840195, + "flos": 22122409524480.0, + "grad_norm": 1.7304601418190944, + "language_loss": 0.8320021, + "learning_rate": 3.082889635231126e-06, + "loss": 0.85157686, + "num_input_tokens_seen": 60570160, + "step": 2814, + "time_per_iteration": 2.4557278156280518 + }, + { + "auxiliary_loss_clip": 0.01168663, + "auxiliary_loss_mlp": 0.01028183, + "balance_loss_clip": 1.05481958, + "balance_loss_mlp": 1.01848519, + "epoch": 0.33848373714904106, + "flos": 27308090067840.0, + "grad_norm": 2.672041583900298, + "language_loss": 0.76411843, + "learning_rate": 3.0822346433780925e-06, + "loss": 0.78608686, + "num_input_tokens_seen": 60590885, + "step": 2815, + "time_per_iteration": 2.65100359916687 + }, + { + "auxiliary_loss_clip": 0.01179063, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.05459321, + "balance_loss_mlp": 1.01936984, + "epoch": 0.3386039800396802, + "flos": 25848716394240.0, + "grad_norm": 1.9189750251488258, + "language_loss": 0.87047362, + "learning_rate": 3.0815794873509237e-06, + "loss": 0.89255226, + "num_input_tokens_seen": 60609170, + "step": 2816, + "time_per_iteration": 3.373980760574341 + }, + { + "auxiliary_loss_clip": 0.01196763, + "auxiliary_loss_mlp": 0.01030862, + "balance_loss_clip": 1.06155539, + "balance_loss_mlp": 1.02186131, + "epoch": 0.33872422293031923, + "flos": 18880646146560.0, + "grad_norm": 1.7859634282738086, + "language_loss": 0.73045981, + "learning_rate": 3.0809241672490066e-06, + "loss": 0.75273609, + "num_input_tokens_seen": 60627340, + "step": 2817, + "time_per_iteration": 2.4223473072052 + }, + { + "auxiliary_loss_clip": 0.01170535, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.05873156, + "balance_loss_mlp": 1.01927888, + "epoch": 0.33884446582095834, + "flos": 23146977064320.0, + "grad_norm": 1.6399070242588367, + "language_loss": 0.84991193, + "learning_rate": 3.080268683171753e-06, + "loss": 0.87189746, + "num_input_tokens_seen": 60647630, + "step": 2818, + "time_per_iteration": 2.541529893875122 + }, + { + "auxiliary_loss_clip": 0.01178853, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.05573595, + "balance_loss_mlp": 1.02034044, + "epoch": 0.33896470871159745, + "flos": 15997342544640.0, + "grad_norm": 3.465952927681238, + "language_loss": 0.89056385, + "learning_rate": 3.0796130352185985e-06, + "loss": 0.91263998, + "num_input_tokens_seen": 60664485, + "step": 2819, + "time_per_iteration": 2.4549951553344727 + }, + { + "auxiliary_loss_clip": 0.0115203, + "auxiliary_loss_mlp": 0.00764486, + "balance_loss_clip": 1.05051506, + "balance_loss_mlp": 1.00056481, + "epoch": 0.3390849516022365, + "flos": 34495754112000.0, + "grad_norm": 1.7206782321086551, + "language_loss": 0.66293436, + "learning_rate": 3.0789572234890057e-06, + "loss": 0.68209958, + "num_input_tokens_seen": 60686125, + "step": 2820, + "time_per_iteration": 2.6476056575775146 + }, + { + "auxiliary_loss_clip": 0.01168583, + "auxiliary_loss_mlp": 0.01031529, + "balance_loss_clip": 1.05967259, + "balance_loss_mlp": 1.02205217, + "epoch": 0.3392051944928756, + "flos": 16180307447040.0, + "grad_norm": 1.5980720703876337, + "language_loss": 0.77722597, + "learning_rate": 3.0783012480824596e-06, + "loss": 0.79922712, + "num_input_tokens_seen": 60705270, + "step": 2821, + "time_per_iteration": 3.373331308364868 + }, + { + "auxiliary_loss_clip": 0.01192974, + "auxiliary_loss_mlp": 0.01035819, + "balance_loss_clip": 1.05705953, + "balance_loss_mlp": 1.02707541, + "epoch": 0.33932543738351467, + "flos": 17086656349440.0, + "grad_norm": 2.05781483801845, + "language_loss": 0.74176073, + "learning_rate": 3.077645109098471e-06, + "loss": 0.7640487, + "num_input_tokens_seen": 60721540, + "step": 2822, + "time_per_iteration": 3.177468776702881 + }, + { + "auxiliary_loss_clip": 0.01138654, + "auxiliary_loss_mlp": 0.01031232, + "balance_loss_clip": 1.05350184, + "balance_loss_mlp": 1.02254176, + "epoch": 0.3394456802741538, + "flos": 22126970551680.0, + "grad_norm": 1.8771677630903518, + "language_loss": 0.72189283, + "learning_rate": 3.076988806636577e-06, + "loss": 0.74359167, + "num_input_tokens_seen": 60739300, + "step": 2823, + "time_per_iteration": 3.2664270401000977 + }, + { + "auxiliary_loss_clip": 0.01171885, + "auxiliary_loss_mlp": 0.00764276, + "balance_loss_clip": 1.05893779, + "balance_loss_mlp": 1.00064969, + "epoch": 0.3395659231647929, + "flos": 25226887121280.0, + "grad_norm": 1.8426482199465122, + "language_loss": 0.88545227, + "learning_rate": 3.0763323407963377e-06, + "loss": 0.90481389, + "num_input_tokens_seen": 60758910, + "step": 2824, + "time_per_iteration": 2.5418875217437744 + }, + { + "auxiliary_loss_clip": 0.01177069, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.0530386, + "balance_loss_mlp": 1.02266824, + "epoch": 0.33968616605543195, + "flos": 29096477343360.0, + "grad_norm": 1.7419393378233285, + "language_loss": 0.80139029, + "learning_rate": 3.075675711677337e-06, + "loss": 0.82346797, + "num_input_tokens_seen": 60779005, + "step": 2825, + "time_per_iteration": 2.550868034362793 + }, + { + "auxiliary_loss_clip": 0.01161744, + "auxiliary_loss_mlp": 0.01038405, + "balance_loss_clip": 1.05785692, + "balance_loss_mlp": 1.02902889, + "epoch": 0.33980640894607106, + "flos": 21433966479360.0, + "grad_norm": 1.875569005835158, + "language_loss": 0.77708316, + "learning_rate": 3.0750189193791865e-06, + "loss": 0.7990846, + "num_input_tokens_seen": 60798590, + "step": 2826, + "time_per_iteration": 2.492832660675049 + }, + { + "auxiliary_loss_clip": 0.01177994, + "auxiliary_loss_mlp": 0.01028095, + "balance_loss_clip": 1.05595779, + "balance_loss_mlp": 1.01919031, + "epoch": 0.33992665183671017, + "flos": 32490035596800.0, + "grad_norm": 2.25449747897589, + "language_loss": 0.70592326, + "learning_rate": 3.0743619640015203e-06, + "loss": 0.72798419, + "num_input_tokens_seen": 60818840, + "step": 2827, + "time_per_iteration": 2.5631790161132812 + }, + { + "auxiliary_loss_clip": 0.0116979, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.05369449, + "balance_loss_mlp": 1.02279806, + "epoch": 0.3400468947273492, + "flos": 17055414495360.0, + "grad_norm": 3.858596552100289, + "language_loss": 0.92709273, + "learning_rate": 3.073704845643999e-06, + "loss": 0.9491148, + "num_input_tokens_seen": 60835965, + "step": 2828, + "time_per_iteration": 2.5107100009918213 + }, + { + "auxiliary_loss_clip": 0.01181236, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.05490315, + "balance_loss_mlp": 1.02958906, + "epoch": 0.34016713761798834, + "flos": 16872988296960.0, + "grad_norm": 2.7177534372300367, + "language_loss": 0.77822864, + "learning_rate": 3.0730475644063063e-06, + "loss": 0.80043423, + "num_input_tokens_seen": 60851065, + "step": 2829, + "time_per_iteration": 2.4641716480255127 + }, + { + "auxiliary_loss_clip": 0.01156822, + "auxiliary_loss_mlp": 0.00763451, + "balance_loss_clip": 1.05148387, + "balance_loss_mlp": 1.00060332, + "epoch": 0.34028738050862745, + "flos": 21907161273600.0, + "grad_norm": 1.6332431121648796, + "language_loss": 0.64891446, + "learning_rate": 3.072390120388151e-06, + "loss": 0.66811717, + "num_input_tokens_seen": 60869390, + "step": 2830, + "time_per_iteration": 2.4921460151672363 + }, + { + "auxiliary_loss_clip": 0.01181327, + "auxiliary_loss_mlp": 0.01029944, + "balance_loss_clip": 1.05773854, + "balance_loss_mlp": 1.02050829, + "epoch": 0.3404076233992665, + "flos": 22746034477440.0, + "grad_norm": 2.7823955540506313, + "language_loss": 0.70972896, + "learning_rate": 3.071732513689267e-06, + "loss": 0.73184162, + "num_input_tokens_seen": 60887925, + "step": 2831, + "time_per_iteration": 2.477276563644409 + }, + { + "auxiliary_loss_clip": 0.01183593, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.06125474, + "balance_loss_mlp": 1.02537358, + "epoch": 0.3405278662899056, + "flos": 17052361839360.0, + "grad_norm": 2.119739785314952, + "language_loss": 0.6717006, + "learning_rate": 3.0710747444094134e-06, + "loss": 0.69388211, + "num_input_tokens_seen": 60905955, + "step": 2832, + "time_per_iteration": 2.439854621887207 + }, + { + "auxiliary_loss_clip": 0.01168702, + "auxiliary_loss_mlp": 0.01033059, + "balance_loss_clip": 1.05697453, + "balance_loss_mlp": 1.02399278, + "epoch": 0.3406481091805447, + "flos": 42813131783040.0, + "grad_norm": 1.7877636726489856, + "language_loss": 0.64813149, + "learning_rate": 3.070416812648372e-06, + "loss": 0.67014909, + "num_input_tokens_seen": 60929405, + "step": 2833, + "time_per_iteration": 2.6894752979278564 + }, + { + "auxiliary_loss_clip": 0.01147064, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.04871488, + "balance_loss_mlp": 1.02310157, + "epoch": 0.3407683520711838, + "flos": 26761457917440.0, + "grad_norm": 2.0562421399888353, + "language_loss": 0.65065026, + "learning_rate": 3.069758718505951e-06, + "loss": 0.67243946, + "num_input_tokens_seen": 60951145, + "step": 2834, + "time_per_iteration": 2.5993926525115967 + }, + { + "auxiliary_loss_clip": 0.01194028, + "auxiliary_loss_mlp": 0.0103552, + "balance_loss_clip": 1.05948353, + "balance_loss_mlp": 1.02622747, + "epoch": 0.3408885949618229, + "flos": 28767643309440.0, + "grad_norm": 1.5872817200681593, + "language_loss": 0.79973483, + "learning_rate": 3.0691004620819836e-06, + "loss": 0.82203031, + "num_input_tokens_seen": 60971275, + "step": 2835, + "time_per_iteration": 2.5071046352386475 + }, + { + "auxiliary_loss_clip": 0.01047716, + "auxiliary_loss_mlp": 0.01001519, + "balance_loss_clip": 1.02581668, + "balance_loss_mlp": 1.00033307, + "epoch": 0.341008837852462, + "flos": 63576252881280.0, + "grad_norm": 0.794941977036189, + "language_loss": 0.60230374, + "learning_rate": 3.0684420434763254e-06, + "loss": 0.62279606, + "num_input_tokens_seen": 61037460, + "step": 2836, + "time_per_iteration": 3.1616110801696777 + }, + { + "auxiliary_loss_clip": 0.0114123, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.05393887, + "balance_loss_mlp": 1.02721214, + "epoch": 0.34112908074310105, + "flos": 20812173120000.0, + "grad_norm": 1.875328642869392, + "language_loss": 0.77162451, + "learning_rate": 3.06778346278886e-06, + "loss": 0.79339111, + "num_input_tokens_seen": 61056295, + "step": 2837, + "time_per_iteration": 2.584921360015869 + }, + { + "auxiliary_loss_clip": 0.01196858, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.06109297, + "balance_loss_mlp": 1.01985598, + "epoch": 0.34124932363374016, + "flos": 24976446520320.0, + "grad_norm": 1.6778330485733912, + "language_loss": 0.78711438, + "learning_rate": 3.0671247201194906e-06, + "loss": 0.80937111, + "num_input_tokens_seen": 61078430, + "step": 2838, + "time_per_iteration": 2.5037589073181152 + }, + { + "auxiliary_loss_clip": 0.01150946, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.05079067, + "balance_loss_mlp": 1.02421236, + "epoch": 0.3413695665243792, + "flos": 28402970480640.0, + "grad_norm": 1.7403362728454987, + "language_loss": 0.75793797, + "learning_rate": 3.066465815568151e-06, + "loss": 0.77977693, + "num_input_tokens_seen": 61099260, + "step": 2839, + "time_per_iteration": 2.672842264175415 + }, + { + "auxiliary_loss_clip": 0.01179379, + "auxiliary_loss_mlp": 0.01027021, + "balance_loss_clip": 1.05462503, + "balance_loss_mlp": 1.01860511, + "epoch": 0.34148980941501833, + "flos": 25302012416640.0, + "grad_norm": 1.799863540739093, + "language_loss": 0.69140643, + "learning_rate": 3.0658067492347947e-06, + "loss": 0.7134704, + "num_input_tokens_seen": 61121900, + "step": 2840, + "time_per_iteration": 2.557950258255005 + }, + { + "auxiliary_loss_clip": 0.01101928, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.04898286, + "balance_loss_mlp": 1.0218823, + "epoch": 0.34161005230565744, + "flos": 17530081747200.0, + "grad_norm": 3.3886022029106737, + "language_loss": 0.66399121, + "learning_rate": 3.065147521219402e-06, + "loss": 0.68532276, + "num_input_tokens_seen": 61141155, + "step": 2841, + "time_per_iteration": 2.627209186553955 + }, + { + "auxiliary_loss_clip": 0.01159118, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.05848575, + "balance_loss_mlp": 1.02666676, + "epoch": 0.3417302951962965, + "flos": 43650101566080.0, + "grad_norm": 1.4637670655832455, + "language_loss": 0.74448383, + "learning_rate": 3.064488131621977e-06, + "loss": 0.7664308, + "num_input_tokens_seen": 61164480, + "step": 2842, + "time_per_iteration": 2.6990067958831787 + }, + { + "auxiliary_loss_clip": 0.01171966, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.05330873, + "balance_loss_mlp": 1.02562118, + "epoch": 0.3418505380869356, + "flos": 30882207012480.0, + "grad_norm": 1.8366930298179058, + "language_loss": 0.73696601, + "learning_rate": 3.063828580542549e-06, + "loss": 0.75903594, + "num_input_tokens_seen": 61185675, + "step": 2843, + "time_per_iteration": 3.413469076156616 + }, + { + "auxiliary_loss_clip": 0.01162901, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.05430579, + "balance_loss_mlp": 1.02448463, + "epoch": 0.3419707809775747, + "flos": 19463871277440.0, + "grad_norm": 3.4930476029731596, + "language_loss": 0.73393798, + "learning_rate": 3.0631688680811706e-06, + "loss": 0.75589663, + "num_input_tokens_seen": 61205300, + "step": 2844, + "time_per_iteration": 2.522000312805176 + }, + { + "auxiliary_loss_clip": 0.01195029, + "auxiliary_loss_mlp": 0.0103834, + "balance_loss_clip": 1.0588218, + "balance_loss_mlp": 1.02940512, + "epoch": 0.3420910238682138, + "flos": 28727818104960.0, + "grad_norm": 2.5380852845665056, + "language_loss": 0.75948596, + "learning_rate": 3.062508994337921e-06, + "loss": 0.7818197, + "num_input_tokens_seen": 61224905, + "step": 2845, + "time_per_iteration": 2.5299503803253174 + }, + { + "auxiliary_loss_clip": 0.01179607, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.05426919, + "balance_loss_mlp": 1.02167535, + "epoch": 0.3422112667588529, + "flos": 21397265758080.0, + "grad_norm": 1.9335868565547332, + "language_loss": 0.79656082, + "learning_rate": 3.0618489594129013e-06, + "loss": 0.81866395, + "num_input_tokens_seen": 61243045, + "step": 2846, + "time_per_iteration": 2.472214460372925 + }, + { + "auxiliary_loss_clip": 0.01156333, + "auxiliary_loss_mlp": 0.01030199, + "balance_loss_clip": 1.05567932, + "balance_loss_mlp": 1.02195549, + "epoch": 0.342331509649492, + "flos": 13881450038400.0, + "grad_norm": 2.8792717115825157, + "language_loss": 0.71314698, + "learning_rate": 3.061188763406239e-06, + "loss": 0.73501229, + "num_input_tokens_seen": 61259190, + "step": 2847, + "time_per_iteration": 2.5289528369903564 + }, + { + "auxiliary_loss_clip": 0.01159031, + "auxiliary_loss_mlp": 0.01036551, + "balance_loss_clip": 1.05303383, + "balance_loss_mlp": 1.02731848, + "epoch": 0.34245175254013105, + "flos": 28621450955520.0, + "grad_norm": 2.0738726317178307, + "language_loss": 0.82062089, + "learning_rate": 3.060528406418085e-06, + "loss": 0.84257674, + "num_input_tokens_seen": 61279040, + "step": 2848, + "time_per_iteration": 3.411067485809326 + }, + { + "auxiliary_loss_clip": 0.01156205, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.05335414, + "balance_loss_mlp": 1.02458823, + "epoch": 0.34257199543077016, + "flos": 34127058960000.0, + "grad_norm": 1.7329179094990828, + "language_loss": 0.61690629, + "learning_rate": 3.0598678885486145e-06, + "loss": 0.63879555, + "num_input_tokens_seen": 61301580, + "step": 2849, + "time_per_iteration": 4.286153793334961 + }, + { + "auxiliary_loss_clip": 0.01152153, + "auxiliary_loss_mlp": 0.00763574, + "balance_loss_clip": 1.05148506, + "balance_loss_mlp": 1.00064421, + "epoch": 0.34269223832140927, + "flos": 19974018188160.0, + "grad_norm": 1.7104306537581189, + "language_loss": 0.74597412, + "learning_rate": 3.0592072098980282e-06, + "loss": 0.76513135, + "num_input_tokens_seen": 61321240, + "step": 2850, + "time_per_iteration": 2.5296194553375244 + }, + { + "auxiliary_loss_clip": 0.01156511, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.05220044, + "balance_loss_mlp": 1.02336204, + "epoch": 0.3428124812120483, + "flos": 27235658292480.0, + "grad_norm": 2.0927009317449117, + "language_loss": 0.72429907, + "learning_rate": 3.0585463705665514e-06, + "loss": 0.7461887, + "num_input_tokens_seen": 61341615, + "step": 2851, + "time_per_iteration": 2.558152437210083 + }, + { + "auxiliary_loss_clip": 0.01147678, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.05035079, + "balance_loss_mlp": 1.02469015, + "epoch": 0.34293272410268744, + "flos": 24570871079040.0, + "grad_norm": 2.3775123087698216, + "language_loss": 0.7083534, + "learning_rate": 3.0578853706544304e-06, + "loss": 0.73016202, + "num_input_tokens_seen": 61359005, + "step": 2852, + "time_per_iteration": 2.5719010829925537 + }, + { + "auxiliary_loss_clip": 0.01151815, + "auxiliary_loss_mlp": 0.00764177, + "balance_loss_clip": 1.05299461, + "balance_loss_mlp": 1.00068247, + "epoch": 0.34305296699332655, + "flos": 21506865131520.0, + "grad_norm": 2.533326538591344, + "language_loss": 0.65476978, + "learning_rate": 3.0572242102619404e-06, + "loss": 0.67392975, + "num_input_tokens_seen": 61376160, + "step": 2853, + "time_per_iteration": 2.5469963550567627 + }, + { + "auxiliary_loss_clip": 0.01161826, + "auxiliary_loss_mlp": 0.01033595, + "balance_loss_clip": 1.05493999, + "balance_loss_mlp": 1.02486849, + "epoch": 0.3431732098839656, + "flos": 24056665931520.0, + "grad_norm": 1.7055883971682353, + "language_loss": 0.80535883, + "learning_rate": 3.0565628894893784e-06, + "loss": 0.82731307, + "num_input_tokens_seen": 61396795, + "step": 2854, + "time_per_iteration": 2.5753142833709717 + }, + { + "auxiliary_loss_clip": 0.01171231, + "auxiliary_loss_mlp": 0.01031877, + "balance_loss_clip": 1.05597413, + "balance_loss_mlp": 1.0231812, + "epoch": 0.3432934527746047, + "flos": 16800879744000.0, + "grad_norm": 1.6235873889745867, + "language_loss": 0.74764627, + "learning_rate": 3.0559014084370655e-06, + "loss": 0.76967728, + "num_input_tokens_seen": 61415320, + "step": 2855, + "time_per_iteration": 2.5247013568878174 + }, + { + "auxiliary_loss_clip": 0.01168402, + "auxiliary_loss_mlp": 0.01029502, + "balance_loss_clip": 1.05393612, + "balance_loss_mlp": 1.02011454, + "epoch": 0.34341369566524377, + "flos": 23439720908160.0, + "grad_norm": 1.6235014409123967, + "language_loss": 0.78580523, + "learning_rate": 3.055239767205349e-06, + "loss": 0.80778426, + "num_input_tokens_seen": 61437070, + "step": 2856, + "time_per_iteration": 2.5534403324127197 + }, + { + "auxiliary_loss_clip": 0.01179992, + "auxiliary_loss_mlp": 0.01033105, + "balance_loss_clip": 1.0611037, + "balance_loss_mlp": 1.02428365, + "epoch": 0.3435339385558829, + "flos": 17267466435840.0, + "grad_norm": 1.8040581741170942, + "language_loss": 0.78219175, + "learning_rate": 3.054577965894599e-06, + "loss": 0.80432272, + "num_input_tokens_seen": 61453215, + "step": 2857, + "time_per_iteration": 2.477968454360962 + }, + { + "auxiliary_loss_clip": 0.01175765, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.06136703, + "balance_loss_mlp": 1.02341461, + "epoch": 0.343654181446522, + "flos": 22199366413440.0, + "grad_norm": 1.7485965374897743, + "language_loss": 0.70266128, + "learning_rate": 3.0539160046052094e-06, + "loss": 0.724756, + "num_input_tokens_seen": 61472915, + "step": 2858, + "time_per_iteration": 2.531754493713379 + }, + { + "auxiliary_loss_clip": 0.01156407, + "auxiliary_loss_mlp": 0.01036703, + "balance_loss_clip": 1.05174041, + "balance_loss_mlp": 1.02662969, + "epoch": 0.34377442433716104, + "flos": 19901801894400.0, + "grad_norm": 2.3387116561969172, + "language_loss": 0.70100939, + "learning_rate": 3.0532538834376003e-06, + "loss": 0.72294044, + "num_input_tokens_seen": 61492475, + "step": 2859, + "time_per_iteration": 2.5214338302612305 + }, + { + "auxiliary_loss_clip": 0.01181855, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.05591464, + "balance_loss_mlp": 1.02688098, + "epoch": 0.34389466722780015, + "flos": 22197678474240.0, + "grad_norm": 1.7691035631254464, + "language_loss": 0.78658879, + "learning_rate": 3.0525916024922143e-06, + "loss": 0.80876577, + "num_input_tokens_seen": 61511660, + "step": 2860, + "time_per_iteration": 2.461674213409424 + }, + { + "auxiliary_loss_clip": 0.01162466, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_clip": 1.05317581, + "balance_loss_mlp": 1.02409387, + "epoch": 0.34401491011843927, + "flos": 18624567110400.0, + "grad_norm": 2.8709792150611073, + "language_loss": 0.84490848, + "learning_rate": 3.0519291618695193e-06, + "loss": 0.86686194, + "num_input_tokens_seen": 61529060, + "step": 2861, + "time_per_iteration": 2.4676291942596436 + }, + { + "auxiliary_loss_clip": 0.01140195, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.0478785, + "balance_loss_mlp": 1.02580857, + "epoch": 0.3441351530090783, + "flos": 17858197509120.0, + "grad_norm": 1.5933669564669064, + "language_loss": 0.75363326, + "learning_rate": 3.0512665616700065e-06, + "loss": 0.77537769, + "num_input_tokens_seen": 61548125, + "step": 2862, + "time_per_iteration": 2.509248971939087 + }, + { + "auxiliary_loss_clip": 0.01126235, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.04855323, + "balance_loss_mlp": 1.02740145, + "epoch": 0.34425539589971743, + "flos": 23112754381440.0, + "grad_norm": 1.8061499002611905, + "language_loss": 0.89301401, + "learning_rate": 3.0506038019941933e-06, + "loss": 0.91463596, + "num_input_tokens_seen": 61568135, + "step": 2863, + "time_per_iteration": 2.658419132232666 + }, + { + "auxiliary_loss_clip": 0.01149556, + "auxiliary_loss_mlp": 0.01027002, + "balance_loss_clip": 1.05410445, + "balance_loss_mlp": 1.01834762, + "epoch": 0.34437563879035654, + "flos": 21907699977600.0, + "grad_norm": 3.3593188937805154, + "language_loss": 0.67939389, + "learning_rate": 3.049940882942617e-06, + "loss": 0.70115948, + "num_input_tokens_seen": 61586920, + "step": 2864, + "time_per_iteration": 2.616769790649414 + }, + { + "auxiliary_loss_clip": 0.01191192, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.05491972, + "balance_loss_mlp": 1.02242041, + "epoch": 0.3444958816809956, + "flos": 23076915586560.0, + "grad_norm": 10.237397370839172, + "language_loss": 0.80480927, + "learning_rate": 3.0492778046158448e-06, + "loss": 0.82703823, + "num_input_tokens_seen": 61608340, + "step": 2865, + "time_per_iteration": 2.4431538581848145 + }, + { + "auxiliary_loss_clip": 0.01178974, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.05908966, + "balance_loss_mlp": 1.0240171, + "epoch": 0.3446161245716347, + "flos": 21908633731200.0, + "grad_norm": 1.9731235851958284, + "language_loss": 0.77011299, + "learning_rate": 3.0486145671144633e-06, + "loss": 0.79222465, + "num_input_tokens_seen": 61628130, + "step": 2866, + "time_per_iteration": 2.4715888500213623 + }, + { + "auxiliary_loss_clip": 0.01099033, + "auxiliary_loss_mlp": 0.01035573, + "balance_loss_clip": 1.0438087, + "balance_loss_mlp": 1.0259831, + "epoch": 0.3447363674622738, + "flos": 25112834461440.0, + "grad_norm": 2.543567343040239, + "language_loss": 0.76915431, + "learning_rate": 3.047951170539086e-06, + "loss": 0.7905004, + "num_input_tokens_seen": 61647755, + "step": 2867, + "time_per_iteration": 2.6193974018096924 + }, + { + "auxiliary_loss_clip": 0.01148248, + "auxiliary_loss_mlp": 0.01039723, + "balance_loss_clip": 1.05726612, + "balance_loss_mlp": 1.03155136, + "epoch": 0.3448566103529129, + "flos": 11984684451840.0, + "grad_norm": 1.8275632005147604, + "language_loss": 0.83997416, + "learning_rate": 3.047287614990349e-06, + "loss": 0.8618539, + "num_input_tokens_seen": 61665675, + "step": 2868, + "time_per_iteration": 2.513073205947876 + }, + { + "auxiliary_loss_clip": 0.01158404, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.05368495, + "balance_loss_mlp": 1.02187431, + "epoch": 0.344976853243552, + "flos": 40187882465280.0, + "grad_norm": 2.342040551024905, + "language_loss": 0.62204897, + "learning_rate": 3.046623900568914e-06, + "loss": 0.64394498, + "num_input_tokens_seen": 61688240, + "step": 2869, + "time_per_iteration": 2.650275468826294 + }, + { + "auxiliary_loss_clip": 0.01159393, + "auxiliary_loss_mlp": 0.0103366, + "balance_loss_clip": 1.05224645, + "balance_loss_mlp": 1.0245111, + "epoch": 0.34509709613419104, + "flos": 28723652127360.0, + "grad_norm": 5.212375758726025, + "language_loss": 0.69895619, + "learning_rate": 3.045960027375465e-06, + "loss": 0.72088671, + "num_input_tokens_seen": 61706075, + "step": 2870, + "time_per_iteration": 3.398531675338745 + }, + { + "auxiliary_loss_clip": 0.01182474, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.05527139, + "balance_loss_mlp": 1.02251267, + "epoch": 0.34521733902483015, + "flos": 29967597982080.0, + "grad_norm": 4.708531668857249, + "language_loss": 0.82405508, + "learning_rate": 3.045295995510711e-06, + "loss": 0.84620118, + "num_input_tokens_seen": 61723045, + "step": 2871, + "time_per_iteration": 2.5406501293182373 + }, + { + "auxiliary_loss_clip": 0.01159498, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.05462241, + "balance_loss_mlp": 1.02331328, + "epoch": 0.34533758191546926, + "flos": 27923059843200.0, + "grad_norm": 2.0246450591988254, + "language_loss": 0.7362324, + "learning_rate": 3.0446318050753865e-06, + "loss": 0.75814581, + "num_input_tokens_seen": 61743525, + "step": 2872, + "time_per_iteration": 2.5530900955200195 + }, + { + "auxiliary_loss_clip": 0.01171955, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.05520427, + "balance_loss_mlp": 1.02305448, + "epoch": 0.3454578248061083, + "flos": 27125879351040.0, + "grad_norm": 2.013350686971414, + "language_loss": 0.77449453, + "learning_rate": 3.0439674561702474e-06, + "loss": 0.79652596, + "num_input_tokens_seen": 61763025, + "step": 2873, + "time_per_iteration": 2.594564914703369 + }, + { + "auxiliary_loss_clip": 0.01174804, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.05588913, + "balance_loss_mlp": 1.02320588, + "epoch": 0.3455780676967474, + "flos": 19024899166080.0, + "grad_norm": 2.378899857448509, + "language_loss": 0.88083708, + "learning_rate": 3.043302948896076e-06, + "loss": 0.90290093, + "num_input_tokens_seen": 61781630, + "step": 2874, + "time_per_iteration": 2.458533525466919 + }, + { + "auxiliary_loss_clip": 0.01124766, + "auxiliary_loss_mlp": 0.01033048, + "balance_loss_clip": 1.05099654, + "balance_loss_mlp": 1.02399421, + "epoch": 0.34569831058738654, + "flos": 34496005507200.0, + "grad_norm": 2.1854991874801555, + "language_loss": 0.60771257, + "learning_rate": 3.0426382833536756e-06, + "loss": 0.6292907, + "num_input_tokens_seen": 61804985, + "step": 2875, + "time_per_iteration": 4.31379508972168 + }, + { + "auxiliary_loss_clip": 0.01143287, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.04923487, + "balance_loss_mlp": 1.02156055, + "epoch": 0.3458185534780256, + "flos": 31138681098240.0, + "grad_norm": 2.264687999635517, + "language_loss": 0.77959049, + "learning_rate": 3.041973459643877e-06, + "loss": 0.80132473, + "num_input_tokens_seen": 61824440, + "step": 2876, + "time_per_iteration": 3.3671629428863525 + }, + { + "auxiliary_loss_clip": 0.0112489, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.04592729, + "balance_loss_mlp": 1.02159607, + "epoch": 0.3459387963686647, + "flos": 32452508862720.0, + "grad_norm": 2.1421198147396643, + "language_loss": 0.67067444, + "learning_rate": 3.0413084778675334e-06, + "loss": 0.69222462, + "num_input_tokens_seen": 61845690, + "step": 2877, + "time_per_iteration": 2.657008409500122 + }, + { + "auxiliary_loss_clip": 0.01153671, + "auxiliary_loss_mlp": 0.00763408, + "balance_loss_clip": 1.04934549, + "balance_loss_mlp": 1.00064945, + "epoch": 0.3460590392593038, + "flos": 24675658030080.0, + "grad_norm": 2.0852301773977917, + "language_loss": 0.8401655, + "learning_rate": 3.0406433381255214e-06, + "loss": 0.85933626, + "num_input_tokens_seen": 61863725, + "step": 2878, + "time_per_iteration": 2.548107385635376 + }, + { + "auxiliary_loss_clip": 0.01175807, + "auxiliary_loss_mlp": 0.01027541, + "balance_loss_clip": 1.05775738, + "balance_loss_mlp": 1.01937246, + "epoch": 0.34617928214994287, + "flos": 18807316531200.0, + "grad_norm": 3.048187612612579, + "language_loss": 0.82509327, + "learning_rate": 3.0399780405187425e-06, + "loss": 0.84712672, + "num_input_tokens_seen": 61882720, + "step": 2879, + "time_per_iteration": 2.463813066482544 + }, + { + "auxiliary_loss_clip": 0.01174518, + "auxiliary_loss_mlp": 0.01027048, + "balance_loss_clip": 1.05523729, + "balance_loss_mlp": 1.01909065, + "epoch": 0.346299525040582, + "flos": 24857653265280.0, + "grad_norm": 2.4315994336817637, + "language_loss": 0.78618073, + "learning_rate": 3.0393125851481216e-06, + "loss": 0.80819643, + "num_input_tokens_seen": 61902595, + "step": 2880, + "time_per_iteration": 2.514599084854126 + }, + { + "auxiliary_loss_clip": 0.01146137, + "auxiliary_loss_mlp": 0.01024417, + "balance_loss_clip": 1.05446315, + "balance_loss_mlp": 1.01624501, + "epoch": 0.3464197679312211, + "flos": 16434914025600.0, + "grad_norm": 2.2383979210936613, + "language_loss": 0.86767185, + "learning_rate": 3.038646972114608e-06, + "loss": 0.88937736, + "num_input_tokens_seen": 61918920, + "step": 2881, + "time_per_iteration": 2.5247910022735596 + }, + { + "auxiliary_loss_clip": 0.0114468, + "auxiliary_loss_mlp": 0.01041295, + "balance_loss_clip": 1.05382347, + "balance_loss_mlp": 1.03275335, + "epoch": 0.34654001082186014, + "flos": 22382474970240.0, + "grad_norm": 1.7562835057200905, + "language_loss": 0.67518997, + "learning_rate": 3.037981201519174e-06, + "loss": 0.69704974, + "num_input_tokens_seen": 61939520, + "step": 2882, + "time_per_iteration": 2.561526298522949 + }, + { + "auxiliary_loss_clip": 0.01178516, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.05910099, + "balance_loss_mlp": 1.02461433, + "epoch": 0.34666025371249926, + "flos": 19573901614080.0, + "grad_norm": 2.7169571989647916, + "language_loss": 0.71286345, + "learning_rate": 3.0373152734628175e-06, + "loss": 0.73497903, + "num_input_tokens_seen": 61957800, + "step": 2883, + "time_per_iteration": 2.4900712966918945 + }, + { + "auxiliary_loss_clip": 0.01172466, + "auxiliary_loss_mlp": 0.01027372, + "balance_loss_clip": 1.0540055, + "balance_loss_mlp": 1.01847339, + "epoch": 0.34678049660313837, + "flos": 15267637751040.0, + "grad_norm": 2.152190947245872, + "language_loss": 0.76223826, + "learning_rate": 3.0366491880465584e-06, + "loss": 0.78423667, + "num_input_tokens_seen": 61975820, + "step": 2884, + "time_per_iteration": 2.4612512588500977 + }, + { + "auxiliary_loss_clip": 0.01195421, + "auxiliary_loss_mlp": 0.0103248, + "balance_loss_clip": 1.059726, + "balance_loss_mlp": 1.02328932, + "epoch": 0.3469007394937774, + "flos": 21181550630400.0, + "grad_norm": 1.613784129173304, + "language_loss": 0.82047951, + "learning_rate": 3.035982945371443e-06, + "loss": 0.84275854, + "num_input_tokens_seen": 61997515, + "step": 2885, + "time_per_iteration": 2.4990787506103516 + }, + { + "auxiliary_loss_clip": 0.01170348, + "auxiliary_loss_mlp": 0.01029612, + "balance_loss_clip": 1.05553651, + "balance_loss_mlp": 1.02049804, + "epoch": 0.34702098238441653, + "flos": 22375471818240.0, + "grad_norm": 2.055747682232124, + "language_loss": 0.85413849, + "learning_rate": 3.035316545538537e-06, + "loss": 0.87613809, + "num_input_tokens_seen": 62016310, + "step": 2886, + "time_per_iteration": 2.521435260772705 + }, + { + "auxiliary_loss_clip": 0.01165092, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.06063163, + "balance_loss_mlp": 1.02253628, + "epoch": 0.3471412252750556, + "flos": 22929430343040.0, + "grad_norm": 2.649077036405507, + "language_loss": 0.7893368, + "learning_rate": 3.034649988648935e-06, + "loss": 0.81129944, + "num_input_tokens_seen": 62036075, + "step": 2887, + "time_per_iteration": 2.516091823577881 + }, + { + "auxiliary_loss_clip": 0.01167655, + "auxiliary_loss_mlp": 0.01026649, + "balance_loss_clip": 1.0553093, + "balance_loss_mlp": 1.01782775, + "epoch": 0.3472614681656947, + "flos": 21324259365120.0, + "grad_norm": 1.682397715866157, + "language_loss": 0.8068493, + "learning_rate": 3.033983274803752e-06, + "loss": 0.82879233, + "num_input_tokens_seen": 62055865, + "step": 2888, + "time_per_iteration": 2.536759853363037 + }, + { + "auxiliary_loss_clip": 0.01160019, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.05383372, + "balance_loss_mlp": 1.02261615, + "epoch": 0.3473817110563338, + "flos": 23475739271040.0, + "grad_norm": 4.246333451645234, + "language_loss": 0.7272895, + "learning_rate": 3.0333164041041283e-06, + "loss": 0.74920505, + "num_input_tokens_seen": 62072180, + "step": 2889, + "time_per_iteration": 2.5323047637939453 + }, + { + "auxiliary_loss_clip": 0.01121529, + "auxiliary_loss_mlp": 0.01025175, + "balance_loss_clip": 1.04906702, + "balance_loss_mlp": 1.01687253, + "epoch": 0.34750195394697286, + "flos": 22346025644160.0, + "grad_norm": 2.175184053088743, + "language_loss": 0.71810549, + "learning_rate": 3.032649376651228e-06, + "loss": 0.73957253, + "num_input_tokens_seen": 62091600, + "step": 2890, + "time_per_iteration": 2.6042559146881104 + }, + { + "auxiliary_loss_clip": 0.01150244, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.05282807, + "balance_loss_mlp": 1.01841486, + "epoch": 0.347622196837612, + "flos": 29095004885760.0, + "grad_norm": 3.4002649271359715, + "language_loss": 0.75715297, + "learning_rate": 3.031982192546238e-06, + "loss": 0.77893382, + "num_input_tokens_seen": 62114695, + "step": 2891, + "time_per_iteration": 2.593083381652832 + }, + { + "auxiliary_loss_clip": 0.01178937, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.05558956, + "balance_loss_mlp": 1.02431178, + "epoch": 0.3477424397282511, + "flos": 22455732758400.0, + "grad_norm": 1.996430127765452, + "language_loss": 0.94606566, + "learning_rate": 3.0313148518903696e-06, + "loss": 0.9681834, + "num_input_tokens_seen": 62134520, + "step": 2892, + "time_per_iteration": 2.484234571456909 + }, + { + "auxiliary_loss_clip": 0.01167416, + "auxiliary_loss_mlp": 0.01028234, + "balance_loss_clip": 1.05686927, + "balance_loss_mlp": 1.01939476, + "epoch": 0.34786268261889014, + "flos": 15778790242560.0, + "grad_norm": 2.0046582628429657, + "language_loss": 0.81266308, + "learning_rate": 3.030647354784859e-06, + "loss": 0.83461958, + "num_input_tokens_seen": 62151560, + "step": 2893, + "time_per_iteration": 2.498206377029419 + }, + { + "auxiliary_loss_clip": 0.01150919, + "auxiliary_loss_mlp": 0.01033242, + "balance_loss_clip": 1.05435252, + "balance_loss_mlp": 1.02456331, + "epoch": 0.34798292550952925, + "flos": 20777627214720.0, + "grad_norm": 1.7405890773186343, + "language_loss": 0.76891041, + "learning_rate": 3.029979701330964e-06, + "loss": 0.79075205, + "num_input_tokens_seen": 62170985, + "step": 2894, + "time_per_iteration": 2.5509817600250244 + }, + { + "auxiliary_loss_clip": 0.0116866, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.05564237, + "balance_loss_mlp": 1.023646, + "epoch": 0.34810316840016836, + "flos": 19937820257280.0, + "grad_norm": 2.271278945655884, + "language_loss": 0.80084401, + "learning_rate": 3.029311891629966e-06, + "loss": 0.82285404, + "num_input_tokens_seen": 62189440, + "step": 2895, + "time_per_iteration": 2.5065693855285645 + }, + { + "auxiliary_loss_clip": 0.01159761, + "auxiliary_loss_mlp": 0.01036377, + "balance_loss_clip": 1.05475485, + "balance_loss_mlp": 1.02717996, + "epoch": 0.3482234112908074, + "flos": 23623296341760.0, + "grad_norm": 1.7273877714629253, + "language_loss": 0.73963678, + "learning_rate": 3.0286439257831744e-06, + "loss": 0.76159817, + "num_input_tokens_seen": 62208910, + "step": 2896, + "time_per_iteration": 3.395796298980713 + }, + { + "auxiliary_loss_clip": 0.01198129, + "auxiliary_loss_mlp": 0.01037214, + "balance_loss_clip": 1.05938578, + "balance_loss_mlp": 1.02655709, + "epoch": 0.3483436541814465, + "flos": 23986712194560.0, + "grad_norm": 3.16386010005365, + "language_loss": 0.71699721, + "learning_rate": 3.0279758038919156e-06, + "loss": 0.73935062, + "num_input_tokens_seen": 62227135, + "step": 2897, + "time_per_iteration": 2.4587204456329346 + }, + { + "auxiliary_loss_clip": 0.01179106, + "auxiliary_loss_mlp": 0.01032963, + "balance_loss_clip": 1.05689836, + "balance_loss_mlp": 1.02384305, + "epoch": 0.34846389707208564, + "flos": 22638338524800.0, + "grad_norm": 1.6537049686488743, + "language_loss": 0.77854073, + "learning_rate": 3.0273075260575455e-06, + "loss": 0.80066139, + "num_input_tokens_seen": 62246035, + "step": 2898, + "time_per_iteration": 2.4945077896118164 + }, + { + "auxiliary_loss_clip": 0.01166885, + "auxiliary_loss_mlp": 0.01035079, + "balance_loss_clip": 1.05392015, + "balance_loss_mlp": 1.02496982, + "epoch": 0.3485841399627247, + "flos": 21792857218560.0, + "grad_norm": 1.8870563275997037, + "language_loss": 0.80711806, + "learning_rate": 3.0266390923814396e-06, + "loss": 0.82913768, + "num_input_tokens_seen": 62264095, + "step": 2899, + "time_per_iteration": 2.508171319961548 + }, + { + "auxiliary_loss_clip": 0.01170637, + "auxiliary_loss_mlp": 0.01037961, + "balance_loss_clip": 1.06040359, + "balance_loss_mlp": 1.02800679, + "epoch": 0.3487043828533638, + "flos": 17019036996480.0, + "grad_norm": 1.8152172148819326, + "language_loss": 0.82229769, + "learning_rate": 3.0259705029650008e-06, + "loss": 0.8443836, + "num_input_tokens_seen": 62282025, + "step": 2900, + "time_per_iteration": 2.485881805419922 + }, + { + "auxiliary_loss_clip": 0.01180616, + "auxiliary_loss_mlp": 0.01026141, + "balance_loss_clip": 1.05577207, + "balance_loss_mlp": 1.01788545, + "epoch": 0.34882462574400286, + "flos": 22601135013120.0, + "grad_norm": 1.6252599133425325, + "language_loss": 0.72880208, + "learning_rate": 3.025301757909652e-06, + "loss": 0.75086969, + "num_input_tokens_seen": 62302220, + "step": 2901, + "time_per_iteration": 2.4926414489746094 + }, + { + "auxiliary_loss_clip": 0.0115377, + "auxiliary_loss_mlp": 0.00764605, + "balance_loss_clip": 1.05450368, + "balance_loss_mlp": 1.00065589, + "epoch": 0.34894486863464197, + "flos": 29861518141440.0, + "grad_norm": 1.783490674131647, + "language_loss": 0.80567622, + "learning_rate": 3.024632857316842e-06, + "loss": 0.82485998, + "num_input_tokens_seen": 62323535, + "step": 2902, + "time_per_iteration": 4.201333999633789 + }, + { + "auxiliary_loss_clip": 0.0118364, + "auxiliary_loss_mlp": 0.01028962, + "balance_loss_clip": 1.06009841, + "balance_loss_mlp": 1.01966381, + "epoch": 0.3490651115252811, + "flos": 22122265870080.0, + "grad_norm": 1.919950527440679, + "language_loss": 0.77397877, + "learning_rate": 3.0239638012880412e-06, + "loss": 0.79610479, + "num_input_tokens_seen": 62343430, + "step": 2903, + "time_per_iteration": 3.2788820266723633 + }, + { + "auxiliary_loss_clip": 0.0112799, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.04885983, + "balance_loss_mlp": 1.02212143, + "epoch": 0.34918535441592014, + "flos": 12676682943360.0, + "grad_norm": 2.3410715635561457, + "language_loss": 0.81419742, + "learning_rate": 3.0232945899247466e-06, + "loss": 0.83579409, + "num_input_tokens_seen": 62360365, + "step": 2904, + "time_per_iteration": 2.7015492916107178 + }, + { + "auxiliary_loss_clip": 0.01178911, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.05403042, + "balance_loss_mlp": 1.02729511, + "epoch": 0.34930559730655925, + "flos": 23185617120000.0, + "grad_norm": 1.776881880303038, + "language_loss": 0.77339458, + "learning_rate": 3.022625223328476e-06, + "loss": 0.79555392, + "num_input_tokens_seen": 62382105, + "step": 2905, + "time_per_iteration": 2.6516213417053223 + }, + { + "auxiliary_loss_clip": 0.01186029, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.05825877, + "balance_loss_mlp": 1.02438486, + "epoch": 0.34942584019719836, + "flos": 22855023319680.0, + "grad_norm": 1.393738410815465, + "language_loss": 0.69099772, + "learning_rate": 3.0219557016007723e-06, + "loss": 0.71321005, + "num_input_tokens_seen": 62402235, + "step": 2906, + "time_per_iteration": 2.589181661605835 + }, + { + "auxiliary_loss_clip": 0.01176101, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.05693066, + "balance_loss_mlp": 1.02091467, + "epoch": 0.3495460830878374, + "flos": 24426043441920.0, + "grad_norm": 1.8369676914442659, + "language_loss": 0.6969316, + "learning_rate": 3.021286024843202e-06, + "loss": 0.71899647, + "num_input_tokens_seen": 62420430, + "step": 2907, + "time_per_iteration": 2.490055799484253 + }, + { + "auxiliary_loss_clip": 0.0109323, + "auxiliary_loss_mlp": 0.01002864, + "balance_loss_clip": 1.02513075, + "balance_loss_mlp": 1.00180876, + "epoch": 0.3496663259784765, + "flos": 70008749389440.0, + "grad_norm": 1.0726586884330434, + "language_loss": 0.6485374, + "learning_rate": 3.0206161931573526e-06, + "loss": 0.66949832, + "num_input_tokens_seen": 62472980, + "step": 2908, + "time_per_iteration": 2.960287570953369 + }, + { + "auxiliary_loss_clip": 0.01160997, + "auxiliary_loss_mlp": 0.01035073, + "balance_loss_clip": 1.05074692, + "balance_loss_mlp": 1.0265795, + "epoch": 0.34978656886911563, + "flos": 28692805322880.0, + "grad_norm": 1.603290085295026, + "language_loss": 0.92862695, + "learning_rate": 3.0199462066448388e-06, + "loss": 0.95058757, + "num_input_tokens_seen": 62495175, + "step": 2909, + "time_per_iteration": 2.577993392944336 + }, + { + "auxiliary_loss_clip": 0.01181659, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.05983531, + "balance_loss_mlp": 1.02118409, + "epoch": 0.3499068117597547, + "flos": 21142156389120.0, + "grad_norm": 1.8314536908633274, + "language_loss": 0.69664884, + "learning_rate": 3.019276065407296e-06, + "loss": 0.71877223, + "num_input_tokens_seen": 62514295, + "step": 2910, + "time_per_iteration": 2.47017765045166 + }, + { + "auxiliary_loss_clip": 0.01138413, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.05168366, + "balance_loss_mlp": 1.02610993, + "epoch": 0.3500270546503938, + "flos": 22782699285120.0, + "grad_norm": 1.8468371536414188, + "language_loss": 0.80348551, + "learning_rate": 3.018605769546385e-06, + "loss": 0.82522392, + "num_input_tokens_seen": 62534850, + "step": 2911, + "time_per_iteration": 2.575129508972168 + }, + { + "auxiliary_loss_clip": 0.0117745, + "auxiliary_loss_mlp": 0.01034137, + "balance_loss_clip": 1.05382395, + "balance_loss_mlp": 1.0245049, + "epoch": 0.3501472975410329, + "flos": 22894058424960.0, + "grad_norm": 1.804305570846599, + "language_loss": 0.79593498, + "learning_rate": 3.017935319163788e-06, + "loss": 0.81805086, + "num_input_tokens_seen": 62553810, + "step": 2912, + "time_per_iteration": 2.48646879196167 + }, + { + "auxiliary_loss_clip": 0.01180931, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.05763423, + "balance_loss_mlp": 1.0240376, + "epoch": 0.35026754043167196, + "flos": 25446588658560.0, + "grad_norm": 3.3857023562322914, + "language_loss": 0.70608127, + "learning_rate": 3.017264714361213e-06, + "loss": 0.72823489, + "num_input_tokens_seen": 62573460, + "step": 2913, + "time_per_iteration": 2.4986729621887207 + }, + { + "auxiliary_loss_clip": 0.01164549, + "auxiliary_loss_mlp": 0.00764494, + "balance_loss_clip": 1.05506444, + "balance_loss_mlp": 1.00079739, + "epoch": 0.3503877833223111, + "flos": 19573757959680.0, + "grad_norm": 1.8962038542750148, + "language_loss": 0.82142198, + "learning_rate": 3.016593955240389e-06, + "loss": 0.84071243, + "num_input_tokens_seen": 62592150, + "step": 2914, + "time_per_iteration": 2.5073235034942627 + }, + { + "auxiliary_loss_clip": 0.01079893, + "auxiliary_loss_mlp": 0.010011, + "balance_loss_clip": 1.02345228, + "balance_loss_mlp": 1.0000447, + "epoch": 0.3505080262129502, + "flos": 65072075880960.0, + "grad_norm": 0.822748475037358, + "language_loss": 0.63760668, + "learning_rate": 3.015923041903071e-06, + "loss": 0.65841663, + "num_input_tokens_seen": 62658275, + "step": 2915, + "time_per_iteration": 3.100501537322998 + }, + { + "auxiliary_loss_clip": 0.01181317, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.06030369, + "balance_loss_mlp": 1.02055824, + "epoch": 0.35062826910358924, + "flos": 29314562768640.0, + "grad_norm": 2.3166085606717055, + "language_loss": 0.8339988, + "learning_rate": 3.0152519744510347e-06, + "loss": 0.85611582, + "num_input_tokens_seen": 62678075, + "step": 2916, + "time_per_iteration": 2.5334339141845703 + }, + { + "auxiliary_loss_clip": 0.01152459, + "auxiliary_loss_mlp": 0.01033822, + "balance_loss_clip": 1.05300522, + "balance_loss_mlp": 1.0243094, + "epoch": 0.35074851199422835, + "flos": 23987717775360.0, + "grad_norm": 1.8136290003301474, + "language_loss": 0.82546228, + "learning_rate": 3.014580752986081e-06, + "loss": 0.84732509, + "num_input_tokens_seen": 62696950, + "step": 2917, + "time_per_iteration": 2.5631556510925293 + }, + { + "auxiliary_loss_clip": 0.01139605, + "auxiliary_loss_mlp": 0.01035894, + "balance_loss_clip": 1.05364418, + "balance_loss_mlp": 1.02705419, + "epoch": 0.3508687548848674, + "flos": 15224436668160.0, + "grad_norm": 2.3067430055861204, + "language_loss": 0.78285944, + "learning_rate": 3.0139093776100345e-06, + "loss": 0.80461442, + "num_input_tokens_seen": 62713540, + "step": 2918, + "time_per_iteration": 2.5434465408325195 + }, + { + "auxiliary_loss_clip": 0.01190736, + "auxiliary_loss_mlp": 0.01029792, + "balance_loss_clip": 1.05679631, + "balance_loss_mlp": 1.02061892, + "epoch": 0.3509889977755065, + "flos": 21361750185600.0, + "grad_norm": 1.7939210097053386, + "language_loss": 0.75197208, + "learning_rate": 3.013237848424741e-06, + "loss": 0.77417737, + "num_input_tokens_seen": 62732925, + "step": 2919, + "time_per_iteration": 2.4598734378814697 + }, + { + "auxiliary_loss_clip": 0.01166497, + "auxiliary_loss_mlp": 0.01034552, + "balance_loss_clip": 1.05639613, + "balance_loss_mlp": 1.0256592, + "epoch": 0.35110924066614563, + "flos": 19135360465920.0, + "grad_norm": 4.647650362100503, + "language_loss": 0.75478858, + "learning_rate": 3.012566165532072e-06, + "loss": 0.77679908, + "num_input_tokens_seen": 62751715, + "step": 2920, + "time_per_iteration": 2.514832019805908 + }, + { + "auxiliary_loss_clip": 0.01129612, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.05160165, + "balance_loss_mlp": 1.02517498, + "epoch": 0.3512294835567847, + "flos": 21980885938560.0, + "grad_norm": 1.9924305951716799, + "language_loss": 0.76581919, + "learning_rate": 3.0118943290339207e-06, + "loss": 0.78745663, + "num_input_tokens_seen": 62771925, + "step": 2921, + "time_per_iteration": 2.6265459060668945 + }, + { + "auxiliary_loss_clip": 0.01142973, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.0510267, + "balance_loss_mlp": 1.02229762, + "epoch": 0.3513497264474238, + "flos": 17817294896640.0, + "grad_norm": 2.1042732354260294, + "language_loss": 0.68369895, + "learning_rate": 3.011222339032204e-06, + "loss": 0.7054463, + "num_input_tokens_seen": 62790075, + "step": 2922, + "time_per_iteration": 2.5352516174316406 + }, + { + "auxiliary_loss_clip": 0.01191693, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.05668116, + "balance_loss_mlp": 1.02787066, + "epoch": 0.3514699693380629, + "flos": 26943417239040.0, + "grad_norm": 1.7984535259243666, + "language_loss": 0.69699448, + "learning_rate": 3.0105501956288626e-06, + "loss": 0.7192862, + "num_input_tokens_seen": 62810545, + "step": 2923, + "time_per_iteration": 3.3468363285064697 + }, + { + "auxiliary_loss_clip": 0.0118585, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.05740762, + "balance_loss_mlp": 1.02428055, + "epoch": 0.35159021222870196, + "flos": 15267565923840.0, + "grad_norm": 1.9478105332390803, + "language_loss": 0.72798163, + "learning_rate": 3.0098778989258602e-06, + "loss": 0.7501806, + "num_input_tokens_seen": 62829155, + "step": 2924, + "time_per_iteration": 2.454888105392456 + }, + { + "auxiliary_loss_clip": 0.01145254, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.05203819, + "balance_loss_mlp": 1.02738249, + "epoch": 0.35171045511934107, + "flos": 13984154000640.0, + "grad_norm": 2.951258086630555, + "language_loss": 0.8815009, + "learning_rate": 3.009205449025183e-06, + "loss": 0.90331817, + "num_input_tokens_seen": 62845350, + "step": 2925, + "time_per_iteration": 2.51007080078125 + }, + { + "auxiliary_loss_clip": 0.01146694, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.05057228, + "balance_loss_mlp": 1.02404332, + "epoch": 0.3518306980099802, + "flos": 14283434119680.0, + "grad_norm": 1.9469640914707576, + "language_loss": 0.63222831, + "learning_rate": 3.008532846028842e-06, + "loss": 0.65402818, + "num_input_tokens_seen": 62862110, + "step": 2926, + "time_per_iteration": 2.513120412826538 + }, + { + "auxiliary_loss_clip": 0.01197455, + "auxiliary_loss_mlp": 0.01037714, + "balance_loss_clip": 1.06008434, + "balance_loss_mlp": 1.02777159, + "epoch": 0.35195094090061924, + "flos": 27052872958080.0, + "grad_norm": 2.6667906545515296, + "language_loss": 0.72278976, + "learning_rate": 3.0078600900388694e-06, + "loss": 0.74514151, + "num_input_tokens_seen": 62882415, + "step": 2927, + "time_per_iteration": 2.4651401042938232 + }, + { + "auxiliary_loss_clip": 0.01138861, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.04865503, + "balance_loss_mlp": 1.02143335, + "epoch": 0.35207118379125835, + "flos": 25629266252160.0, + "grad_norm": 1.8224458948767925, + "language_loss": 0.73845279, + "learning_rate": 3.007187181157323e-06, + "loss": 0.76015192, + "num_input_tokens_seen": 62902425, + "step": 2928, + "time_per_iteration": 2.5711920261383057 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.04720783, + "balance_loss_mlp": 1.01982498, + "epoch": 0.35219142668189746, + "flos": 18004713085440.0, + "grad_norm": 2.219903385204011, + "language_loss": 0.68447924, + "learning_rate": 3.006514119486282e-06, + "loss": 0.70588905, + "num_input_tokens_seen": 62919255, + "step": 2929, + "time_per_iteration": 4.204680681228638 + }, + { + "auxiliary_loss_clip": 0.01142461, + "auxiliary_loss_mlp": 0.01027718, + "balance_loss_clip": 1.05109572, + "balance_loss_mlp": 1.01898611, + "epoch": 0.3523116695725365, + "flos": 14028109269120.0, + "grad_norm": 1.7345613065517642, + "language_loss": 0.69631529, + "learning_rate": 3.005840905127849e-06, + "loss": 0.7180171, + "num_input_tokens_seen": 62936160, + "step": 2930, + "time_per_iteration": 2.516960382461548 + }, + { + "auxiliary_loss_clip": 0.01193654, + "auxiliary_loss_mlp": 0.00764166, + "balance_loss_clip": 1.06096816, + "balance_loss_mlp": 1.00087798, + "epoch": 0.3524319124631756, + "flos": 21433966479360.0, + "grad_norm": 2.3048826319079865, + "language_loss": 0.87105119, + "learning_rate": 3.0051675381841516e-06, + "loss": 0.89062941, + "num_input_tokens_seen": 62953470, + "step": 2931, + "time_per_iteration": 2.4634788036346436 + }, + { + "auxiliary_loss_clip": 0.01106299, + "auxiliary_loss_mlp": 0.01035957, + "balance_loss_clip": 1.04673111, + "balance_loss_mlp": 1.02665281, + "epoch": 0.3525521553538147, + "flos": 26322773114880.0, + "grad_norm": 1.600391766366554, + "language_loss": 0.76800299, + "learning_rate": 3.0044940187573363e-06, + "loss": 0.78942549, + "num_input_tokens_seen": 62974480, + "step": 2932, + "time_per_iteration": 2.670949935913086 + }, + { + "auxiliary_loss_clip": 0.01182458, + "auxiliary_loss_mlp": 0.01034208, + "balance_loss_clip": 1.05614138, + "balance_loss_mlp": 1.02508903, + "epoch": 0.3526723982444538, + "flos": 21543314457600.0, + "grad_norm": 1.7523460779030162, + "language_loss": 0.65343928, + "learning_rate": 3.003820346949578e-06, + "loss": 0.67560595, + "num_input_tokens_seen": 62992560, + "step": 2933, + "time_per_iteration": 2.576967477798462 + }, + { + "auxiliary_loss_clip": 0.01192051, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.05648708, + "balance_loss_mlp": 1.02417004, + "epoch": 0.3527926411350929, + "flos": 23733649900800.0, + "grad_norm": 1.8096476756193962, + "language_loss": 0.79540652, + "learning_rate": 3.003146522863071e-06, + "loss": 0.81766337, + "num_input_tokens_seen": 63013445, + "step": 2934, + "time_per_iteration": 2.5080907344818115 + }, + { + "auxiliary_loss_clip": 0.0116387, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.05665207, + "balance_loss_mlp": 1.02315843, + "epoch": 0.35291288402573195, + "flos": 30445461544320.0, + "grad_norm": 2.1890740253179786, + "language_loss": 0.85630852, + "learning_rate": 3.0024725466000345e-06, + "loss": 0.87826431, + "num_input_tokens_seen": 63033400, + "step": 2935, + "time_per_iteration": 2.5992751121520996 + }, + { + "auxiliary_loss_clip": 0.01179127, + "auxiliary_loss_mlp": 0.01027721, + "balance_loss_clip": 1.05850887, + "balance_loss_mlp": 1.01947129, + "epoch": 0.35303312691637107, + "flos": 23112179763840.0, + "grad_norm": 1.662825815363589, + "language_loss": 0.78672743, + "learning_rate": 3.0017984182627087e-06, + "loss": 0.80879587, + "num_input_tokens_seen": 63052725, + "step": 2936, + "time_per_iteration": 2.496722459793091 + }, + { + "auxiliary_loss_clip": 0.01148166, + "auxiliary_loss_mlp": 0.00764392, + "balance_loss_clip": 1.0506084, + "balance_loss_mlp": 1.00084865, + "epoch": 0.3531533698070102, + "flos": 21835699165440.0, + "grad_norm": 18.760500707031348, + "language_loss": 0.82230437, + "learning_rate": 3.00112413795336e-06, + "loss": 0.84143001, + "num_input_tokens_seen": 63072560, + "step": 2937, + "time_per_iteration": 2.68031644821167 + }, + { + "auxiliary_loss_clip": 0.01159717, + "auxiliary_loss_mlp": 0.01032299, + "balance_loss_clip": 1.05035639, + "balance_loss_mlp": 1.02319145, + "epoch": 0.35327361269764923, + "flos": 15778969810560.0, + "grad_norm": 1.8526473721634917, + "language_loss": 0.8030709, + "learning_rate": 3.000449705774275e-06, + "loss": 0.82499111, + "num_input_tokens_seen": 63090800, + "step": 2938, + "time_per_iteration": 2.502084732055664 + }, + { + "auxiliary_loss_clip": 0.01179959, + "auxiliary_loss_mlp": 0.01027559, + "balance_loss_clip": 1.05810738, + "balance_loss_mlp": 1.01852274, + "epoch": 0.35339385558828834, + "flos": 22090413484800.0, + "grad_norm": 2.5738479897557567, + "language_loss": 0.71376145, + "learning_rate": 2.9997751218277654e-06, + "loss": 0.73583663, + "num_input_tokens_seen": 63108955, + "step": 2939, + "time_per_iteration": 2.479315996170044 + }, + { + "auxiliary_loss_clip": 0.01195111, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.06016338, + "balance_loss_mlp": 1.02243233, + "epoch": 0.35351409847892745, + "flos": 24165008328960.0, + "grad_norm": 1.883787022771459, + "language_loss": 0.77750146, + "learning_rate": 2.999100386216166e-06, + "loss": 0.79976946, + "num_input_tokens_seen": 63127895, + "step": 2940, + "time_per_iteration": 2.445446729660034 + }, + { + "auxiliary_loss_clip": 0.01164061, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.05529094, + "balance_loss_mlp": 1.02202201, + "epoch": 0.3536343413695665, + "flos": 27052298340480.0, + "grad_norm": 1.8192266325998778, + "language_loss": 0.7428627, + "learning_rate": 2.998425499041831e-06, + "loss": 0.76481116, + "num_input_tokens_seen": 63148410, + "step": 2941, + "time_per_iteration": 2.552973508834839 + }, + { + "auxiliary_loss_clip": 0.01077976, + "auxiliary_loss_mlp": 0.01001691, + "balance_loss_clip": 1.02188063, + "balance_loss_mlp": 1.00062418, + "epoch": 0.3537545842602056, + "flos": 65991066370560.0, + "grad_norm": 1.3645667159315482, + "language_loss": 0.64607334, + "learning_rate": 2.997750460407142e-06, + "loss": 0.66687, + "num_input_tokens_seen": 63209765, + "step": 2942, + "time_per_iteration": 3.092388868331909 + }, + { + "auxiliary_loss_clip": 0.01150851, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.04984069, + "balance_loss_mlp": 1.01954281, + "epoch": 0.35387482715084473, + "flos": 18436897526400.0, + "grad_norm": 2.0288066105800926, + "language_loss": 0.70046389, + "learning_rate": 2.997075270414501e-06, + "loss": 0.72226107, + "num_input_tokens_seen": 63226980, + "step": 2943, + "time_per_iteration": 2.515299081802368 + }, + { + "auxiliary_loss_clip": 0.01068478, + "auxiliary_loss_mlp": 0.01000876, + "balance_loss_clip": 1.02372503, + "balance_loss_mlp": 0.9996959, + "epoch": 0.3539950700414838, + "flos": 65588579498880.0, + "grad_norm": 0.6984994446207212, + "language_loss": 0.57756382, + "learning_rate": 2.9963999291663347e-06, + "loss": 0.59825736, + "num_input_tokens_seen": 63292760, + "step": 2944, + "time_per_iteration": 3.091273784637451 + }, + { + "auxiliary_loss_clip": 0.01138614, + "auxiliary_loss_mlp": 0.01031943, + "balance_loss_clip": 1.0551548, + "balance_loss_mlp": 1.02364039, + "epoch": 0.3541153129321229, + "flos": 20521655919360.0, + "grad_norm": 2.0815134501411916, + "language_loss": 0.73934811, + "learning_rate": 2.9957244367650915e-06, + "loss": 0.76105368, + "num_input_tokens_seen": 63309005, + "step": 2945, + "time_per_iteration": 2.538546323776245 + }, + { + "auxiliary_loss_clip": 0.01131511, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.05332553, + "balance_loss_mlp": 1.0197742, + "epoch": 0.354235555822762, + "flos": 19573578391680.0, + "grad_norm": 1.7673821560908987, + "language_loss": 0.83683157, + "learning_rate": 2.9950487933132425e-06, + "loss": 0.85843694, + "num_input_tokens_seen": 63326420, + "step": 2946, + "time_per_iteration": 2.5497591495513916 + }, + { + "auxiliary_loss_clip": 0.01180689, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.05506551, + "balance_loss_mlp": 1.02214837, + "epoch": 0.35435579871340106, + "flos": 20777268078720.0, + "grad_norm": 2.542723600213033, + "language_loss": 0.71537042, + "learning_rate": 2.994372998913283e-06, + "loss": 0.73748249, + "num_input_tokens_seen": 63344925, + "step": 2947, + "time_per_iteration": 2.4698736667633057 + }, + { + "auxiliary_loss_clip": 0.01165576, + "auxiliary_loss_mlp": 0.0103328, + "balance_loss_clip": 1.05880082, + "balance_loss_mlp": 1.02449429, + "epoch": 0.35447604160404017, + "flos": 23951807153280.0, + "grad_norm": 2.848637827391769, + "language_loss": 0.62397897, + "learning_rate": 2.99369705366773e-06, + "loss": 0.64596748, + "num_input_tokens_seen": 63365170, + "step": 2948, + "time_per_iteration": 2.5240981578826904 + }, + { + "auxiliary_loss_clip": 0.01161566, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.05478275, + "balance_loss_mlp": 1.01985049, + "epoch": 0.3545962844946792, + "flos": 23435662671360.0, + "grad_norm": 2.057421355231936, + "language_loss": 0.82543683, + "learning_rate": 2.9930209576791244e-06, + "loss": 0.84733903, + "num_input_tokens_seen": 63383645, + "step": 2949, + "time_per_iteration": 2.5266072750091553 + }, + { + "auxiliary_loss_clip": 0.01176316, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.05739665, + "balance_loss_mlp": 1.02411985, + "epoch": 0.35471652738531834, + "flos": 22085134185600.0, + "grad_norm": 1.9203970493867377, + "language_loss": 0.63537633, + "learning_rate": 2.9923447110500285e-06, + "loss": 0.65746289, + "num_input_tokens_seen": 63402390, + "step": 2950, + "time_per_iteration": 3.2974531650543213 + }, + { + "auxiliary_loss_clip": 0.01166821, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.05480599, + "balance_loss_mlp": 1.02830553, + "epoch": 0.35483677027595745, + "flos": 27341881787520.0, + "grad_norm": 2.0659287671097832, + "language_loss": 0.75342083, + "learning_rate": 2.9916683138830295e-06, + "loss": 0.77546066, + "num_input_tokens_seen": 63423055, + "step": 2951, + "time_per_iteration": 2.5069446563720703 + }, + { + "auxiliary_loss_clip": 0.01161715, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.05561662, + "balance_loss_mlp": 1.02758908, + "epoch": 0.3549570131665965, + "flos": 13516166678400.0, + "grad_norm": 1.781025458989053, + "language_loss": 0.80889213, + "learning_rate": 2.9909917662807353e-06, + "loss": 0.83087969, + "num_input_tokens_seen": 63440855, + "step": 2952, + "time_per_iteration": 2.465482234954834 + }, + { + "auxiliary_loss_clip": 0.01174228, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.05458212, + "balance_loss_mlp": 1.03014016, + "epoch": 0.3550772560572356, + "flos": 20887549810560.0, + "grad_norm": 2.237762260331911, + "language_loss": 0.69040287, + "learning_rate": 2.9903150683457783e-06, + "loss": 0.71253651, + "num_input_tokens_seen": 63459400, + "step": 2953, + "time_per_iteration": 2.4536101818084717 + }, + { + "auxiliary_loss_clip": 0.01164352, + "auxiliary_loss_mlp": 0.01030517, + "balance_loss_clip": 1.05339575, + "balance_loss_mlp": 1.02217865, + "epoch": 0.3551974989478747, + "flos": 20194042947840.0, + "grad_norm": 1.8619957723256158, + "language_loss": 0.64979893, + "learning_rate": 2.9896382201808126e-06, + "loss": 0.67174768, + "num_input_tokens_seen": 63476800, + "step": 2954, + "time_per_iteration": 2.5076870918273926 + }, + { + "auxiliary_loss_clip": 0.01192801, + "auxiliary_loss_mlp": 0.01031973, + "balance_loss_clip": 1.05734336, + "balance_loss_mlp": 1.02325916, + "epoch": 0.3553177418385138, + "flos": 19828831415040.0, + "grad_norm": 2.3509888501696268, + "language_loss": 0.81204355, + "learning_rate": 2.988961221888516e-06, + "loss": 0.83429134, + "num_input_tokens_seen": 63493475, + "step": 2955, + "time_per_iteration": 4.845149993896484 + }, + { + "auxiliary_loss_clip": 0.01137827, + "auxiliary_loss_mlp": 0.01028094, + "balance_loss_clip": 1.04986143, + "balance_loss_mlp": 1.01952302, + "epoch": 0.3554379847291529, + "flos": 14829132516480.0, + "grad_norm": 3.1613313026581826, + "language_loss": 0.78882468, + "learning_rate": 2.988284073571589e-06, + "loss": 0.81048393, + "num_input_tokens_seen": 63509560, + "step": 2956, + "time_per_iteration": 2.496170997619629 + }, + { + "auxiliary_loss_clip": 0.01180597, + "auxiliary_loss_mlp": 0.00763554, + "balance_loss_clip": 1.05845833, + "balance_loss_mlp": 1.00084758, + "epoch": 0.355558227619792, + "flos": 20485350247680.0, + "grad_norm": 2.170774794026298, + "language_loss": 0.72462809, + "learning_rate": 2.9876067753327528e-06, + "loss": 0.74406964, + "num_input_tokens_seen": 63527290, + "step": 2957, + "time_per_iteration": 2.477116346359253 + }, + { + "auxiliary_loss_clip": 0.0118025, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_clip": 1.05509901, + "balance_loss_mlp": 1.03102446, + "epoch": 0.35567847051043106, + "flos": 37663613256960.0, + "grad_norm": 1.9891017048844266, + "language_loss": 0.80440634, + "learning_rate": 2.986929327274754e-06, + "loss": 0.82661378, + "num_input_tokens_seen": 63547870, + "step": 2958, + "time_per_iteration": 2.586879014968872 + }, + { + "auxiliary_loss_clip": 0.01179266, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.0598433, + "balance_loss_mlp": 1.02468848, + "epoch": 0.35579871340107017, + "flos": 26943058103040.0, + "grad_norm": 1.682022100539393, + "language_loss": 0.7870841, + "learning_rate": 2.9862517295003617e-06, + "loss": 0.80920827, + "num_input_tokens_seen": 63568285, + "step": 2959, + "time_per_iteration": 2.517982244491577 + }, + { + "auxiliary_loss_clip": 0.01143212, + "auxiliary_loss_mlp": 0.01029474, + "balance_loss_clip": 1.04818618, + "balance_loss_mlp": 1.02129018, + "epoch": 0.3559189562917093, + "flos": 28293335193600.0, + "grad_norm": 1.6866107263385866, + "language_loss": 0.7256726, + "learning_rate": 2.9855739821123654e-06, + "loss": 0.74739945, + "num_input_tokens_seen": 63589865, + "step": 2960, + "time_per_iteration": 2.5814740657806396 + }, + { + "auxiliary_loss_clip": 0.01172072, + "auxiliary_loss_mlp": 0.0102601, + "balance_loss_clip": 1.05543816, + "balance_loss_mlp": 1.01732564, + "epoch": 0.35603919918234833, + "flos": 25664063552640.0, + "grad_norm": 1.795013740185745, + "language_loss": 0.81711829, + "learning_rate": 2.98489608521358e-06, + "loss": 0.83909911, + "num_input_tokens_seen": 63609805, + "step": 2961, + "time_per_iteration": 2.533064365386963 + }, + { + "auxiliary_loss_clip": 0.01181756, + "auxiliary_loss_mlp": 0.00763806, + "balance_loss_clip": 1.05550539, + "balance_loss_mlp": 1.00084913, + "epoch": 0.35615944207298744, + "flos": 23000856537600.0, + "grad_norm": 1.914961886565478, + "language_loss": 0.79416561, + "learning_rate": 2.9842180389068425e-06, + "loss": 0.81362116, + "num_input_tokens_seen": 63627115, + "step": 2962, + "time_per_iteration": 2.484043598175049 + }, + { + "auxiliary_loss_clip": 0.01058521, + "auxiliary_loss_mlp": 0.01005315, + "balance_loss_clip": 1.02967417, + "balance_loss_mlp": 1.00359869, + "epoch": 0.35627968496362655, + "flos": 68251283723520.0, + "grad_norm": 0.7663418270421275, + "language_loss": 0.5926497, + "learning_rate": 2.98353984329501e-06, + "loss": 0.61328804, + "num_input_tokens_seen": 63691460, + "step": 2963, + "time_per_iteration": 3.149522542953491 + }, + { + "auxiliary_loss_clip": 0.01163954, + "auxiliary_loss_mlp": 0.01029451, + "balance_loss_clip": 1.05596471, + "balance_loss_mlp": 1.02062416, + "epoch": 0.3563999278542656, + "flos": 22641714403200.0, + "grad_norm": 1.728939880323328, + "language_loss": 0.70818675, + "learning_rate": 2.982861498480965e-06, + "loss": 0.7301209, + "num_input_tokens_seen": 63713840, + "step": 2964, + "time_per_iteration": 2.5470314025878906 + }, + { + "auxiliary_loss_clip": 0.01143081, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.04901266, + "balance_loss_mlp": 1.02518821, + "epoch": 0.3565201707449047, + "flos": 25952533678080.0, + "grad_norm": 1.6449619186871334, + "language_loss": 0.82461715, + "learning_rate": 2.9821830045676122e-06, + "loss": 0.84638464, + "num_input_tokens_seen": 63733540, + "step": 2965, + "time_per_iteration": 2.5800325870513916 + }, + { + "auxiliary_loss_clip": 0.01193302, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.05826831, + "balance_loss_mlp": 1.02447963, + "epoch": 0.3566404136355438, + "flos": 28475725478400.0, + "grad_norm": 1.9779485650648483, + "language_loss": 0.72989655, + "learning_rate": 2.9815043616578793e-06, + "loss": 0.7521534, + "num_input_tokens_seen": 63754335, + "step": 2966, + "time_per_iteration": 2.4848713874816895 + }, + { + "auxiliary_loss_clip": 0.01143199, + "auxiliary_loss_mlp": 0.0103879, + "balance_loss_clip": 1.04881489, + "balance_loss_mlp": 1.03006339, + "epoch": 0.3567606565261829, + "flos": 38363117690880.0, + "grad_norm": 2.0899447486052254, + "language_loss": 0.77082103, + "learning_rate": 2.9808255698547145e-06, + "loss": 0.79264092, + "num_input_tokens_seen": 63777135, + "step": 2967, + "time_per_iteration": 2.7020087242126465 + }, + { + "auxiliary_loss_clip": 0.0117731, + "auxiliary_loss_mlp": 0.01029971, + "balance_loss_clip": 1.05788755, + "balance_loss_mlp": 1.02135789, + "epoch": 0.356880899416822, + "flos": 21981029592960.0, + "grad_norm": 2.0127120577905826, + "language_loss": 0.79628217, + "learning_rate": 2.9801466292610913e-06, + "loss": 0.81835496, + "num_input_tokens_seen": 63797020, + "step": 2968, + "time_per_iteration": 2.4610321521759033 + }, + { + "auxiliary_loss_clip": 0.01174793, + "auxiliary_loss_mlp": 0.0102627, + "balance_loss_clip": 1.05442548, + "balance_loss_mlp": 1.01826167, + "epoch": 0.35700114230746105, + "flos": 18989132198400.0, + "grad_norm": 2.046760480636488, + "language_loss": 0.8116557, + "learning_rate": 2.979467539980003e-06, + "loss": 0.83366632, + "num_input_tokens_seen": 63813810, + "step": 2969, + "time_per_iteration": 2.444014310836792 + }, + { + "auxiliary_loss_clip": 0.01179484, + "auxiliary_loss_mlp": 0.01036124, + "balance_loss_clip": 1.05624914, + "balance_loss_mlp": 1.0277735, + "epoch": 0.35712138519810016, + "flos": 19756112330880.0, + "grad_norm": 2.07334719715132, + "language_loss": 0.7704581, + "learning_rate": 2.978788302114468e-06, + "loss": 0.79261422, + "num_input_tokens_seen": 63830925, + "step": 2970, + "time_per_iteration": 2.4472720623016357 + }, + { + "auxiliary_loss_clip": 0.01174011, + "auxiliary_loss_mlp": 0.0103306, + "balance_loss_clip": 1.05488682, + "balance_loss_mlp": 1.02411962, + "epoch": 0.35724162808873927, + "flos": 35183012008320.0, + "grad_norm": 1.956911213403874, + "language_loss": 0.83020657, + "learning_rate": 2.9781089157675255e-06, + "loss": 0.85227728, + "num_input_tokens_seen": 63849385, + "step": 2971, + "time_per_iteration": 2.5607309341430664 + }, + { + "auxiliary_loss_clip": 0.01172844, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.05708694, + "balance_loss_mlp": 1.02533662, + "epoch": 0.3573618709793783, + "flos": 25556726736000.0, + "grad_norm": 1.4080445607258598, + "language_loss": 0.88316917, + "learning_rate": 2.977429381042238e-06, + "loss": 0.90523559, + "num_input_tokens_seen": 63870060, + "step": 2972, + "time_per_iteration": 2.490654468536377 + }, + { + "auxiliary_loss_clip": 0.01162222, + "auxiliary_loss_mlp": 0.01027474, + "balance_loss_clip": 1.05405188, + "balance_loss_mlp": 1.01970732, + "epoch": 0.35748211387001744, + "flos": 29132352051840.0, + "grad_norm": 2.1795992832952686, + "language_loss": 0.88873875, + "learning_rate": 2.9767496980416913e-06, + "loss": 0.91063571, + "num_input_tokens_seen": 63889355, + "step": 2973, + "time_per_iteration": 2.5249698162078857 + }, + { + "auxiliary_loss_clip": 0.0115881, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.05237639, + "balance_loss_mlp": 1.02408481, + "epoch": 0.35760235676065655, + "flos": 13954169122560.0, + "grad_norm": 3.159873043654318, + "language_loss": 0.80567002, + "learning_rate": 2.9760698668689914e-06, + "loss": 0.82759136, + "num_input_tokens_seen": 63905580, + "step": 2974, + "time_per_iteration": 2.4530551433563232 + }, + { + "auxiliary_loss_clip": 0.0117614, + "auxiliary_loss_mlp": 0.01025103, + "balance_loss_clip": 1.05403423, + "balance_loss_mlp": 1.0172708, + "epoch": 0.3577225996512956, + "flos": 44018688977280.0, + "grad_norm": 1.8480005907925756, + "language_loss": 0.71325302, + "learning_rate": 2.975389887627269e-06, + "loss": 0.73526549, + "num_input_tokens_seen": 63928180, + "step": 2975, + "time_per_iteration": 2.646256685256958 + }, + { + "auxiliary_loss_clip": 0.01152168, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.05320835, + "balance_loss_mlp": 1.02555513, + "epoch": 0.3578428425419347, + "flos": 17055199013760.0, + "grad_norm": 2.228301652818403, + "language_loss": 0.90113956, + "learning_rate": 2.9747097604196764e-06, + "loss": 0.9229961, + "num_input_tokens_seen": 63944825, + "step": 2976, + "time_per_iteration": 3.3176894187927246 + }, + { + "auxiliary_loss_clip": 0.01048579, + "auxiliary_loss_mlp": 0.01002636, + "balance_loss_clip": 1.02354014, + "balance_loss_mlp": 1.00149727, + "epoch": 0.3579630854325738, + "flos": 71676550707840.0, + "grad_norm": 0.6782342843040509, + "language_loss": 0.56674582, + "learning_rate": 2.9740294853493875e-06, + "loss": 0.58725792, + "num_input_tokens_seen": 64016385, + "step": 2977, + "time_per_iteration": 3.346116781234741 + }, + { + "auxiliary_loss_clip": 0.01137286, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.04993582, + "balance_loss_mlp": 1.02073145, + "epoch": 0.3580833283232129, + "flos": 25046651652480.0, + "grad_norm": 2.0318924444903796, + "language_loss": 0.67117083, + "learning_rate": 2.9733490625196008e-06, + "loss": 0.69283187, + "num_input_tokens_seen": 64036245, + "step": 2978, + "time_per_iteration": 2.5833258628845215 + }, + { + "auxiliary_loss_clip": 0.01134314, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.04870224, + "balance_loss_mlp": 1.02261877, + "epoch": 0.358203571213852, + "flos": 13953127628160.0, + "grad_norm": 3.068179925146427, + "language_loss": 0.75730401, + "learning_rate": 2.9726684920335353e-06, + "loss": 0.77894723, + "num_input_tokens_seen": 64054110, + "step": 2979, + "time_per_iteration": 2.497767210006714 + }, + { + "auxiliary_loss_clip": 0.01192178, + "auxiliary_loss_mlp": 0.00763745, + "balance_loss_clip": 1.05593419, + "balance_loss_mlp": 1.00068951, + "epoch": 0.35832381410449105, + "flos": 20302457172480.0, + "grad_norm": 2.2464413454239467, + "language_loss": 0.81978083, + "learning_rate": 2.971987773994432e-06, + "loss": 0.83934009, + "num_input_tokens_seen": 64070295, + "step": 2980, + "time_per_iteration": 2.4605650901794434 + }, + { + "auxiliary_loss_clip": 0.0116609, + "auxiliary_loss_mlp": 0.01024595, + "balance_loss_clip": 1.05126548, + "balance_loss_mlp": 1.0166198, + "epoch": 0.35844405699513016, + "flos": 16983234115200.0, + "grad_norm": 2.0029866291361884, + "language_loss": 0.83118755, + "learning_rate": 2.9713069085055566e-06, + "loss": 0.8530944, + "num_input_tokens_seen": 64088605, + "step": 2981, + "time_per_iteration": 2.476473331451416 + }, + { + "auxiliary_loss_clip": 0.01146202, + "auxiliary_loss_mlp": 0.01025839, + "balance_loss_clip": 1.0511632, + "balance_loss_mlp": 1.01742864, + "epoch": 0.35856429988576927, + "flos": 23216858974080.0, + "grad_norm": 1.9629050954993847, + "language_loss": 0.78956056, + "learning_rate": 2.9706258956701958e-06, + "loss": 0.81128097, + "num_input_tokens_seen": 64108595, + "step": 2982, + "time_per_iteration": 4.155234098434448 + }, + { + "auxiliary_loss_clip": 0.01177914, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.05516458, + "balance_loss_mlp": 1.02272201, + "epoch": 0.3586845427764083, + "flos": 23034576430080.0, + "grad_norm": 2.6952859822389312, + "language_loss": 0.77377182, + "learning_rate": 2.9699447355916575e-06, + "loss": 0.79586053, + "num_input_tokens_seen": 64127405, + "step": 2983, + "time_per_iteration": 2.6154489517211914 + }, + { + "auxiliary_loss_clip": 0.01188941, + "auxiliary_loss_mlp": 0.00763004, + "balance_loss_clip": 1.05645323, + "balance_loss_mlp": 1.00072801, + "epoch": 0.35880478566704743, + "flos": 20010682995840.0, + "grad_norm": 2.0530629640322355, + "language_loss": 0.74110591, + "learning_rate": 2.969263428373275e-06, + "loss": 0.76062536, + "num_input_tokens_seen": 64145755, + "step": 2984, + "time_per_iteration": 2.435987710952759 + }, + { + "auxiliary_loss_clip": 0.01162489, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.05279374, + "balance_loss_mlp": 1.02428913, + "epoch": 0.35892502855768654, + "flos": 13699095667200.0, + "grad_norm": 1.996497047089602, + "language_loss": 0.78986418, + "learning_rate": 2.9685819741184007e-06, + "loss": 0.81181413, + "num_input_tokens_seen": 64164195, + "step": 2985, + "time_per_iteration": 2.4754245281219482 + }, + { + "auxiliary_loss_clip": 0.01140273, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.05050433, + "balance_loss_mlp": 1.02383435, + "epoch": 0.3590452714483256, + "flos": 18114096977280.0, + "grad_norm": 6.0183160916630785, + "language_loss": 0.68161905, + "learning_rate": 2.967900372930411e-06, + "loss": 0.70334518, + "num_input_tokens_seen": 64182705, + "step": 2986, + "time_per_iteration": 2.6570496559143066 + }, + { + "auxiliary_loss_clip": 0.01154199, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.05138612, + "balance_loss_mlp": 1.02426291, + "epoch": 0.3591655143389647, + "flos": 17749352321280.0, + "grad_norm": 3.9236801382411315, + "language_loss": 0.79233557, + "learning_rate": 2.9672186249127046e-06, + "loss": 0.81420475, + "num_input_tokens_seen": 64202170, + "step": 2987, + "time_per_iteration": 2.469970703125 + }, + { + "auxiliary_loss_clip": 0.01160333, + "auxiliary_loss_mlp": 0.01036119, + "balance_loss_clip": 1.05470395, + "balance_loss_mlp": 1.02813792, + "epoch": 0.3592857572296038, + "flos": 25224409082880.0, + "grad_norm": 1.9482356707264374, + "language_loss": 0.78970307, + "learning_rate": 2.9665367301687014e-06, + "loss": 0.81166756, + "num_input_tokens_seen": 64220415, + "step": 2988, + "time_per_iteration": 2.5062496662139893 + }, + { + "auxiliary_loss_clip": 0.0115295, + "auxiliary_loss_mlp": 0.01029108, + "balance_loss_clip": 1.05173755, + "balance_loss_mlp": 1.0210495, + "epoch": 0.3594060001202429, + "flos": 29384408764800.0, + "grad_norm": 2.1563760533526044, + "language_loss": 0.76961029, + "learning_rate": 2.965854688801845e-06, + "loss": 0.79143089, + "num_input_tokens_seen": 64242475, + "step": 2989, + "time_per_iteration": 2.5602898597717285 + }, + { + "auxiliary_loss_clip": 0.01170718, + "auxiliary_loss_mlp": 0.0102655, + "balance_loss_clip": 1.04943419, + "balance_loss_mlp": 1.01852095, + "epoch": 0.359526243010882, + "flos": 17052900543360.0, + "grad_norm": 1.925616359896462, + "language_loss": 0.76317143, + "learning_rate": 2.9651725009156005e-06, + "loss": 0.78514421, + "num_input_tokens_seen": 64260220, + "step": 2990, + "time_per_iteration": 2.4543933868408203 + }, + { + "auxiliary_loss_clip": 0.0115328, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.04994416, + "balance_loss_mlp": 1.02341127, + "epoch": 0.3596464859015211, + "flos": 22965089569920.0, + "grad_norm": 1.5976815336725965, + "language_loss": 0.74403501, + "learning_rate": 2.964490166613454e-06, + "loss": 0.76589024, + "num_input_tokens_seen": 64280145, + "step": 2991, + "time_per_iteration": 2.511255979537964 + }, + { + "auxiliary_loss_clip": 0.0108966, + "auxiliary_loss_mlp": 0.01000824, + "balance_loss_clip": 1.02266264, + "balance_loss_mlp": 0.99978685, + "epoch": 0.35976672879216015, + "flos": 54739462590720.0, + "grad_norm": 0.7590962906847478, + "language_loss": 0.57717937, + "learning_rate": 2.963807685998917e-06, + "loss": 0.59808421, + "num_input_tokens_seen": 64336010, + "step": 2992, + "time_per_iteration": 2.8447141647338867 + }, + { + "auxiliary_loss_clip": 0.01135906, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.04918718, + "balance_loss_mlp": 1.02121711, + "epoch": 0.35988697168279926, + "flos": 43139020901760.0, + "grad_norm": 1.469671895656916, + "language_loss": 0.77936363, + "learning_rate": 2.9631250591755196e-06, + "loss": 0.8010186, + "num_input_tokens_seen": 64358725, + "step": 2993, + "time_per_iteration": 2.7509398460388184 + }, + { + "auxiliary_loss_clip": 0.01156387, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.05406249, + "balance_loss_mlp": 1.02260745, + "epoch": 0.36000721457343837, + "flos": 35845600239360.0, + "grad_norm": 1.6352496961306806, + "language_loss": 0.57549322, + "learning_rate": 2.962442286246817e-06, + "loss": 0.59737372, + "num_input_tokens_seen": 64381555, + "step": 2994, + "time_per_iteration": 2.613842725753784 + }, + { + "auxiliary_loss_clip": 0.01164761, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.05320179, + "balance_loss_mlp": 1.02185488, + "epoch": 0.3601274574640774, + "flos": 18291100222080.0, + "grad_norm": 2.1585049943428656, + "language_loss": 0.69675332, + "learning_rate": 2.9617593673163853e-06, + "loss": 0.71870017, + "num_input_tokens_seen": 64400375, + "step": 2995, + "time_per_iteration": 2.481973648071289 + }, + { + "auxiliary_loss_clip": 0.01162208, + "auxiliary_loss_mlp": 0.01024018, + "balance_loss_clip": 1.0502317, + "balance_loss_mlp": 1.01640046, + "epoch": 0.36024770035471654, + "flos": 13333955961600.0, + "grad_norm": 2.2758988078744617, + "language_loss": 0.77393389, + "learning_rate": 2.9610763024878216e-06, + "loss": 0.79579616, + "num_input_tokens_seen": 64415880, + "step": 2996, + "time_per_iteration": 2.4539296627044678 + }, + { + "auxiliary_loss_clip": 0.01153675, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.05115676, + "balance_loss_mlp": 1.02703476, + "epoch": 0.3603679432453556, + "flos": 20267013427200.0, + "grad_norm": 1.6758543819927192, + "language_loss": 0.91451299, + "learning_rate": 2.960393091864747e-06, + "loss": 0.93640423, + "num_input_tokens_seen": 64434260, + "step": 2997, + "time_per_iteration": 2.489513635635376 + }, + { + "auxiliary_loss_clip": 0.01162383, + "auxiliary_loss_mlp": 0.01024365, + "balance_loss_clip": 1.05561626, + "balance_loss_mlp": 1.01630664, + "epoch": 0.3604881861359947, + "flos": 22451135817600.0, + "grad_norm": 1.8164362400208431, + "language_loss": 0.74771369, + "learning_rate": 2.959709735550804e-06, + "loss": 0.76958114, + "num_input_tokens_seen": 64453855, + "step": 2998, + "time_per_iteration": 2.49832820892334 + }, + { + "auxiliary_loss_clip": 0.01133679, + "auxiliary_loss_mlp": 0.01025359, + "balance_loss_clip": 1.04901218, + "balance_loss_mlp": 1.01730657, + "epoch": 0.3606084290266338, + "flos": 22054251467520.0, + "grad_norm": 2.3783355842547937, + "language_loss": 0.76083314, + "learning_rate": 2.9590262336496575e-06, + "loss": 0.78242362, + "num_input_tokens_seen": 64473585, + "step": 2999, + "time_per_iteration": 2.5815391540527344 + }, + { + "auxiliary_loss_clip": 0.01142755, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.05298042, + "balance_loss_mlp": 1.02572334, + "epoch": 0.36072867191727287, + "flos": 15632921111040.0, + "grad_norm": 2.0668966412379697, + "language_loss": 0.85315132, + "learning_rate": 2.9583425862649936e-06, + "loss": 0.87492335, + "num_input_tokens_seen": 64491720, + "step": 3000, + "time_per_iteration": 2.5043280124664307 + }, + { + "auxiliary_loss_clip": 0.01190005, + "auxiliary_loss_mlp": 0.01028325, + "balance_loss_clip": 1.05618858, + "balance_loss_mlp": 1.02005816, + "epoch": 0.360848914807912, + "flos": 19677000625920.0, + "grad_norm": 1.9197797231320495, + "language_loss": 0.73385042, + "learning_rate": 2.9576587935005215e-06, + "loss": 0.75603372, + "num_input_tokens_seen": 64509800, + "step": 3001, + "time_per_iteration": 2.433746337890625 + }, + { + "auxiliary_loss_clip": 0.01175934, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.05320883, + "balance_loss_mlp": 1.02019477, + "epoch": 0.3609691576985511, + "flos": 18877808972160.0, + "grad_norm": 2.2343689547062087, + "language_loss": 0.71778381, + "learning_rate": 2.9569748554599713e-06, + "loss": 0.73982882, + "num_input_tokens_seen": 64525410, + "step": 3002, + "time_per_iteration": 2.4360578060150146 + }, + { + "auxiliary_loss_clip": 0.01161677, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.05496955, + "balance_loss_mlp": 1.02442932, + "epoch": 0.36108940058919015, + "flos": 42224088648960.0, + "grad_norm": 4.244675395062702, + "language_loss": 0.73288316, + "learning_rate": 2.956290772247097e-06, + "loss": 0.75482583, + "num_input_tokens_seen": 64544085, + "step": 3003, + "time_per_iteration": 3.464754343032837 + }, + { + "auxiliary_loss_clip": 0.01124095, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.04968941, + "balance_loss_mlp": 1.0259521, + "epoch": 0.36120964347982926, + "flos": 23185150243200.0, + "grad_norm": 1.6944738342495917, + "language_loss": 0.73045415, + "learning_rate": 2.9556065439656724e-06, + "loss": 0.75203419, + "num_input_tokens_seen": 64563135, + "step": 3004, + "time_per_iteration": 2.5628418922424316 + }, + { + "auxiliary_loss_clip": 0.01110862, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.04467463, + "balance_loss_mlp": 1.02067125, + "epoch": 0.36132988637046837, + "flos": 18113055482880.0, + "grad_norm": 2.3794617465351733, + "language_loss": 0.81456739, + "learning_rate": 2.9549221707194952e-06, + "loss": 0.83596849, + "num_input_tokens_seen": 64581985, + "step": 3005, + "time_per_iteration": 2.602739095687866 + }, + { + "auxiliary_loss_clip": 0.01176126, + "auxiliary_loss_mlp": 0.01025335, + "balance_loss_clip": 1.05500448, + "balance_loss_mlp": 1.01719332, + "epoch": 0.3614501292611074, + "flos": 27813101333760.0, + "grad_norm": 1.8741744879628925, + "language_loss": 0.72265118, + "learning_rate": 2.954237652612384e-06, + "loss": 0.74466574, + "num_input_tokens_seen": 64601035, + "step": 3006, + "time_per_iteration": 2.5227699279785156 + }, + { + "auxiliary_loss_clip": 0.01155368, + "auxiliary_loss_mlp": 0.01027096, + "balance_loss_clip": 1.05020416, + "balance_loss_mlp": 1.01942468, + "epoch": 0.36157037215174653, + "flos": 22634926732800.0, + "grad_norm": 1.8622469017009153, + "language_loss": 0.84637547, + "learning_rate": 2.9535529897481796e-06, + "loss": 0.86820012, + "num_input_tokens_seen": 64618580, + "step": 3007, + "time_per_iteration": 2.5105478763580322 + }, + { + "auxiliary_loss_clip": 0.01190097, + "auxiliary_loss_mlp": 0.0102716, + "balance_loss_clip": 1.05491745, + "balance_loss_mlp": 1.01880932, + "epoch": 0.36169061504238564, + "flos": 12600839376000.0, + "grad_norm": 2.3578117338497746, + "language_loss": 0.76721716, + "learning_rate": 2.9528681822307446e-06, + "loss": 0.78938973, + "num_input_tokens_seen": 64635430, + "step": 3008, + "time_per_iteration": 3.230964422225952 + }, + { + "auxiliary_loss_clip": 0.01172885, + "auxiliary_loss_mlp": 0.00762667, + "balance_loss_clip": 1.05761623, + "balance_loss_mlp": 1.00062132, + "epoch": 0.3618108579330247, + "flos": 26684644682880.0, + "grad_norm": 2.1050139023473884, + "language_loss": 0.8221277, + "learning_rate": 2.952183230163964e-06, + "loss": 0.84148324, + "num_input_tokens_seen": 64655005, + "step": 3009, + "time_per_iteration": 4.012263774871826 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01025574, + "balance_loss_clip": 1.05066669, + "balance_loss_mlp": 1.01778948, + "epoch": 0.3619311008236638, + "flos": 22817029708800.0, + "grad_norm": 1.8362056113763947, + "language_loss": 0.72998989, + "learning_rate": 2.9514981336517448e-06, + "loss": 0.75166965, + "num_input_tokens_seen": 64674775, + "step": 3010, + "time_per_iteration": 2.5612552165985107 + }, + { + "auxiliary_loss_clip": 0.01174296, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.05639648, + "balance_loss_mlp": 1.02302694, + "epoch": 0.36205134371430286, + "flos": 25919603884800.0, + "grad_norm": 2.1586767954913344, + "language_loss": 0.81476468, + "learning_rate": 2.950812892798015e-06, + "loss": 0.83682394, + "num_input_tokens_seen": 64695670, + "step": 3011, + "time_per_iteration": 2.4976279735565186 + }, + { + "auxiliary_loss_clip": 0.01129894, + "auxiliary_loss_mlp": 0.00763099, + "balance_loss_clip": 1.0525856, + "balance_loss_mlp": 1.00069559, + "epoch": 0.362171586604942, + "flos": 26139592730880.0, + "grad_norm": 7.740300828705119, + "language_loss": 0.87423897, + "learning_rate": 2.9501275077067256e-06, + "loss": 0.89316893, + "num_input_tokens_seen": 64716290, + "step": 3012, + "time_per_iteration": 2.5927493572235107 + }, + { + "auxiliary_loss_clip": 0.01101595, + "auxiliary_loss_mlp": 0.01024346, + "balance_loss_clip": 1.04460859, + "balance_loss_mlp": 1.01646662, + "epoch": 0.3622918294955811, + "flos": 28074208273920.0, + "grad_norm": 1.4297100715115676, + "language_loss": 0.88562202, + "learning_rate": 2.949441978481848e-06, + "loss": 0.90688145, + "num_input_tokens_seen": 64737190, + "step": 3013, + "time_per_iteration": 2.665468215942383 + }, + { + "auxiliary_loss_clip": 0.01149607, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.04914284, + "balance_loss_mlp": 1.02026808, + "epoch": 0.36241207238622014, + "flos": 19828005402240.0, + "grad_norm": 2.4466537836984323, + "language_loss": 0.79897869, + "learning_rate": 2.9487563052273778e-06, + "loss": 0.8207643, + "num_input_tokens_seen": 64753950, + "step": 3014, + "time_per_iteration": 2.503666877746582 + }, + { + "auxiliary_loss_clip": 0.01171805, + "auxiliary_loss_mlp": 0.01031463, + "balance_loss_clip": 1.05699706, + "balance_loss_mlp": 1.02327895, + "epoch": 0.36253231527685925, + "flos": 21397158017280.0, + "grad_norm": 1.8469445822922441, + "language_loss": 0.85834885, + "learning_rate": 2.94807048804733e-06, + "loss": 0.88038158, + "num_input_tokens_seen": 64773570, + "step": 3015, + "time_per_iteration": 2.452136278152466 + }, + { + "auxiliary_loss_clip": 0.01148504, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.0476234, + "balance_loss_mlp": 1.01873493, + "epoch": 0.36265255816749836, + "flos": 18362885552640.0, + "grad_norm": 1.859501233370567, + "language_loss": 0.89644897, + "learning_rate": 2.9473845270457434e-06, + "loss": 0.91820973, + "num_input_tokens_seen": 64790385, + "step": 3016, + "time_per_iteration": 2.5093281269073486 + }, + { + "auxiliary_loss_clip": 0.01151882, + "auxiliary_loss_mlp": 0.01028777, + "balance_loss_clip": 1.04930615, + "balance_loss_mlp": 1.02036715, + "epoch": 0.3627728010581374, + "flos": 18660046769280.0, + "grad_norm": 2.2555949725112887, + "language_loss": 0.70363128, + "learning_rate": 2.946698422326677e-06, + "loss": 0.72543782, + "num_input_tokens_seen": 64807845, + "step": 3017, + "time_per_iteration": 2.476022958755493 + }, + { + "auxiliary_loss_clip": 0.01127007, + "auxiliary_loss_mlp": 0.01028513, + "balance_loss_clip": 1.04501915, + "balance_loss_mlp": 1.02031136, + "epoch": 0.36289304394877653, + "flos": 27524272072320.0, + "grad_norm": 2.2585039808337632, + "language_loss": 0.79826194, + "learning_rate": 2.946012173994213e-06, + "loss": 0.81981713, + "num_input_tokens_seen": 64827630, + "step": 3018, + "time_per_iteration": 2.61464262008667 + }, + { + "auxiliary_loss_clip": 0.0116869, + "auxiliary_loss_mlp": 0.01027502, + "balance_loss_clip": 1.05535173, + "balance_loss_mlp": 1.01940143, + "epoch": 0.36301328683941564, + "flos": 34533244932480.0, + "grad_norm": 1.5639918027379245, + "language_loss": 0.67593777, + "learning_rate": 2.945325782152454e-06, + "loss": 0.69789964, + "num_input_tokens_seen": 64850665, + "step": 3019, + "time_per_iteration": 2.5781259536743164 + }, + { + "auxiliary_loss_clip": 0.01159809, + "auxiliary_loss_mlp": 0.01024673, + "balance_loss_clip": 1.04933882, + "balance_loss_mlp": 1.01688886, + "epoch": 0.3631335297300547, + "flos": 19025976574080.0, + "grad_norm": 2.47884559230756, + "language_loss": 0.78729206, + "learning_rate": 2.9446392469055257e-06, + "loss": 0.80913687, + "num_input_tokens_seen": 64868700, + "step": 3020, + "time_per_iteration": 2.483788251876831 + }, + { + "auxiliary_loss_clip": 0.01140368, + "auxiliary_loss_mlp": 0.01028218, + "balance_loss_clip": 1.0542922, + "balance_loss_mlp": 1.02014184, + "epoch": 0.3632537726206938, + "flos": 19536769929600.0, + "grad_norm": 1.7360396790473371, + "language_loss": 0.79997796, + "learning_rate": 2.9439525683575745e-06, + "loss": 0.82166386, + "num_input_tokens_seen": 64887620, + "step": 3021, + "time_per_iteration": 2.506786823272705 + }, + { + "auxiliary_loss_clip": 0.011932, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.05840254, + "balance_loss_mlp": 1.02205336, + "epoch": 0.3633740155113329, + "flos": 21068611292160.0, + "grad_norm": 2.3537419886204636, + "language_loss": 0.7490477, + "learning_rate": 2.9432657466127694e-06, + "loss": 0.77128685, + "num_input_tokens_seen": 64907190, + "step": 3022, + "time_per_iteration": 2.429471254348755 + }, + { + "auxiliary_loss_clip": 0.01133027, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.05306089, + "balance_loss_mlp": 1.01889479, + "epoch": 0.36349425840197197, + "flos": 20298722158080.0, + "grad_norm": 1.6351722895222163, + "language_loss": 0.76420426, + "learning_rate": 2.9425787817753007e-06, + "loss": 0.78580511, + "num_input_tokens_seen": 64925850, + "step": 3023, + "time_per_iteration": 2.567260503768921 + }, + { + "auxiliary_loss_clip": 0.01147785, + "auxiliary_loss_mlp": 0.01030199, + "balance_loss_clip": 1.05193591, + "balance_loss_mlp": 1.02221227, + "epoch": 0.3636145012926111, + "flos": 29716762331520.0, + "grad_norm": 1.9424965757930548, + "language_loss": 0.71516395, + "learning_rate": 2.94189167394938e-06, + "loss": 0.73694378, + "num_input_tokens_seen": 64948285, + "step": 3024, + "time_per_iteration": 2.604113817214966 + }, + { + "auxiliary_loss_clip": 0.01191209, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.05912459, + "balance_loss_mlp": 1.02292967, + "epoch": 0.3637347441832502, + "flos": 21431847576960.0, + "grad_norm": 1.8814627423968688, + "language_loss": 0.81169844, + "learning_rate": 2.941204423239241e-06, + "loss": 0.83392143, + "num_input_tokens_seen": 64967160, + "step": 3025, + "time_per_iteration": 2.430171251296997 + }, + { + "auxiliary_loss_clip": 0.0117084, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_clip": 1.05368423, + "balance_loss_mlp": 1.02141404, + "epoch": 0.36385498707388925, + "flos": 29533941083520.0, + "grad_norm": 2.0726574161555837, + "language_loss": 0.75780177, + "learning_rate": 2.9405170297491395e-06, + "loss": 0.77980763, + "num_input_tokens_seen": 64987155, + "step": 3026, + "time_per_iteration": 2.5254337787628174 + }, + { + "auxiliary_loss_clip": 0.01109924, + "auxiliary_loss_mlp": 0.00763453, + "balance_loss_clip": 1.05187631, + "balance_loss_mlp": 1.00072742, + "epoch": 0.36397522996452836, + "flos": 22236569925120.0, + "grad_norm": 2.6072112493235218, + "language_loss": 0.80024898, + "learning_rate": 2.939829493583353e-06, + "loss": 0.81898272, + "num_input_tokens_seen": 65003800, + "step": 3027, + "time_per_iteration": 2.594695806503296 + }, + { + "auxiliary_loss_clip": 0.01136742, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.0462985, + "balance_loss_mlp": 1.02015245, + "epoch": 0.3640954728551674, + "flos": 21506505995520.0, + "grad_norm": 2.690110434807627, + "language_loss": 0.83562315, + "learning_rate": 2.939141814846179e-06, + "loss": 0.85727394, + "num_input_tokens_seen": 65021215, + "step": 3028, + "time_per_iteration": 2.5258214473724365 + }, + { + "auxiliary_loss_clip": 0.01160532, + "auxiliary_loss_mlp": 0.01025117, + "balance_loss_clip": 1.05197728, + "balance_loss_mlp": 1.01651645, + "epoch": 0.3642157157458065, + "flos": 17712867081600.0, + "grad_norm": 1.655076761386125, + "language_loss": 0.82297587, + "learning_rate": 2.938453993641938e-06, + "loss": 0.84483242, + "num_input_tokens_seen": 65039590, + "step": 3029, + "time_per_iteration": 3.347519636154175 + }, + { + "auxiliary_loss_clip": 0.01161037, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.05648255, + "balance_loss_mlp": 1.02283382, + "epoch": 0.36433595863644563, + "flos": 17639537466240.0, + "grad_norm": 2.0226674329179355, + "language_loss": 0.70361483, + "learning_rate": 2.937766030074973e-06, + "loss": 0.72553992, + "num_input_tokens_seen": 65056845, + "step": 3030, + "time_per_iteration": 2.472032308578491 + }, + { + "auxiliary_loss_clip": 0.01151273, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.05215549, + "balance_loss_mlp": 1.02192557, + "epoch": 0.3644562015270847, + "flos": 26833279161600.0, + "grad_norm": 1.7956553124666847, + "language_loss": 0.8272683, + "learning_rate": 2.937077924249646e-06, + "loss": 0.8490811, + "num_input_tokens_seen": 65079435, + "step": 3031, + "time_per_iteration": 2.582489490509033 + }, + { + "auxiliary_loss_clip": 0.01165928, + "auxiliary_loss_mlp": 0.01027589, + "balance_loss_clip": 1.05304193, + "balance_loss_mlp": 1.01915455, + "epoch": 0.3645764444177238, + "flos": 14282715847680.0, + "grad_norm": 2.079976720707225, + "language_loss": 0.76207024, + "learning_rate": 2.9363896762703443e-06, + "loss": 0.7840054, + "num_input_tokens_seen": 65096500, + "step": 3032, + "time_per_iteration": 2.459040880203247 + }, + { + "auxiliary_loss_clip": 0.01189664, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.05624104, + "balance_loss_mlp": 1.02109265, + "epoch": 0.3646966873083629, + "flos": 20667489137280.0, + "grad_norm": 1.6881177954387239, + "language_loss": 0.84259558, + "learning_rate": 2.9357012862414725e-06, + "loss": 0.86479485, + "num_input_tokens_seen": 65115860, + "step": 3033, + "time_per_iteration": 2.466895818710327 + }, + { + "auxiliary_loss_clip": 0.01174257, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.05515027, + "balance_loss_mlp": 1.02252591, + "epoch": 0.36481693019900197, + "flos": 27782613665280.0, + "grad_norm": 1.8057515910878763, + "language_loss": 0.71681011, + "learning_rate": 2.9350127542674593e-06, + "loss": 0.73886263, + "num_input_tokens_seen": 65138070, + "step": 3034, + "time_per_iteration": 2.518522262573242 + }, + { + "auxiliary_loss_clip": 0.01168921, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.05580187, + "balance_loss_mlp": 1.02570701, + "epoch": 0.3649371730896411, + "flos": 19712588025600.0, + "grad_norm": 1.7788645567277077, + "language_loss": 0.76612991, + "learning_rate": 2.934324080452755e-06, + "loss": 0.7881583, + "num_input_tokens_seen": 65155860, + "step": 3035, + "time_per_iteration": 3.360319137573242 + }, + { + "auxiliary_loss_clip": 0.01135732, + "auxiliary_loss_mlp": 0.00763618, + "balance_loss_clip": 1.0464834, + "balance_loss_mlp": 1.00079393, + "epoch": 0.3650574159802802, + "flos": 24750496016640.0, + "grad_norm": 1.4653325612181396, + "language_loss": 0.78154784, + "learning_rate": 2.9336352649018307e-06, + "loss": 0.80054134, + "num_input_tokens_seen": 65175930, + "step": 3036, + "time_per_iteration": 3.4139373302459717 + }, + { + "auxiliary_loss_clip": 0.01163501, + "auxiliary_loss_mlp": 0.01033235, + "balance_loss_clip": 1.05402935, + "balance_loss_mlp": 1.02508688, + "epoch": 0.36517765887091924, + "flos": 32853487363200.0, + "grad_norm": 2.170677541433056, + "language_loss": 0.70108736, + "learning_rate": 2.9329463077191783e-06, + "loss": 0.72305477, + "num_input_tokens_seen": 65199305, + "step": 3037, + "time_per_iteration": 2.6024792194366455 + }, + { + "auxiliary_loss_clip": 0.01131867, + "auxiliary_loss_mlp": 0.01023347, + "balance_loss_clip": 1.05067217, + "balance_loss_mlp": 1.01478767, + "epoch": 0.36529790176155835, + "flos": 20120318282880.0, + "grad_norm": 2.21491010429096, + "language_loss": 0.63917327, + "learning_rate": 2.9322572090093135e-06, + "loss": 0.66072547, + "num_input_tokens_seen": 65218010, + "step": 3038, + "time_per_iteration": 2.5622024536132812 + }, + { + "auxiliary_loss_clip": 0.01132215, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.04761934, + "balance_loss_mlp": 1.02286851, + "epoch": 0.36541814465219746, + "flos": 17639573379840.0, + "grad_norm": 3.135063262481432, + "language_loss": 0.76357126, + "learning_rate": 2.9315679688767713e-06, + "loss": 0.78520846, + "num_input_tokens_seen": 65236020, + "step": 3039, + "time_per_iteration": 2.531160354614258 + }, + { + "auxiliary_loss_clip": 0.01155675, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.05148506, + "balance_loss_mlp": 1.02046919, + "epoch": 0.3655383875428365, + "flos": 22674356887680.0, + "grad_norm": 1.5834482963541567, + "language_loss": 0.66304111, + "learning_rate": 2.9308785874261085e-06, + "loss": 0.68488538, + "num_input_tokens_seen": 65256210, + "step": 3040, + "time_per_iteration": 2.5264980792999268 + }, + { + "auxiliary_loss_clip": 0.01191947, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.05893612, + "balance_loss_mlp": 1.02393413, + "epoch": 0.36565863043347563, + "flos": 21981173247360.0, + "grad_norm": 1.7571565599158954, + "language_loss": 0.81965959, + "learning_rate": 2.9301890647619045e-06, + "loss": 0.84189957, + "num_input_tokens_seen": 65275505, + "step": 3041, + "time_per_iteration": 2.4477994441986084 + }, + { + "auxiliary_loss_clip": 0.01169142, + "auxiliary_loss_mlp": 0.0103692, + "balance_loss_clip": 1.05534339, + "balance_loss_mlp": 1.02774668, + "epoch": 0.36577887332411474, + "flos": 24827632473600.0, + "grad_norm": 1.8183491614990759, + "language_loss": 0.8045131, + "learning_rate": 2.929499400988759e-06, + "loss": 0.82657367, + "num_input_tokens_seen": 65296665, + "step": 3042, + "time_per_iteration": 2.5319924354553223 + }, + { + "auxiliary_loss_clip": 0.01175415, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.05617571, + "balance_loss_mlp": 1.02664185, + "epoch": 0.3658991162147538, + "flos": 28293191539200.0, + "grad_norm": 1.957000854596714, + "language_loss": 0.65246022, + "learning_rate": 2.9288095962112927e-06, + "loss": 0.67457116, + "num_input_tokens_seen": 65317370, + "step": 3043, + "time_per_iteration": 2.5277597904205322 + }, + { + "auxiliary_loss_clip": 0.01190413, + "auxiliary_loss_mlp": 0.0102892, + "balance_loss_clip": 1.05742955, + "balance_loss_mlp": 1.02027154, + "epoch": 0.3660193591053929, + "flos": 17785550252160.0, + "grad_norm": 1.8756394006013428, + "language_loss": 0.84880692, + "learning_rate": 2.9281196505341503e-06, + "loss": 0.87100029, + "num_input_tokens_seen": 65334540, + "step": 3044, + "time_per_iteration": 2.408989429473877 + }, + { + "auxiliary_loss_clip": 0.01126781, + "auxiliary_loss_mlp": 0.00763246, + "balance_loss_clip": 1.05176497, + "balance_loss_mlp": 1.00071049, + "epoch": 0.36613960199603196, + "flos": 10342776839040.0, + "grad_norm": 2.169158605704409, + "language_loss": 0.78737581, + "learning_rate": 2.9274295640619946e-06, + "loss": 0.80627608, + "num_input_tokens_seen": 65351670, + "step": 3045, + "time_per_iteration": 2.5475940704345703 + }, + { + "auxiliary_loss_clip": 0.01145906, + "auxiliary_loss_mlp": 0.01026332, + "balance_loss_clip": 1.05023766, + "balance_loss_mlp": 1.01859498, + "epoch": 0.36625984488667107, + "flos": 19755609540480.0, + "grad_norm": 7.06908958868098, + "language_loss": 0.78218085, + "learning_rate": 2.9267393368995103e-06, + "loss": 0.80390322, + "num_input_tokens_seen": 65370900, + "step": 3046, + "time_per_iteration": 2.515636444091797 + }, + { + "auxiliary_loss_clip": 0.01192896, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.05931115, + "balance_loss_mlp": 1.02339768, + "epoch": 0.3663800877773102, + "flos": 17674262939520.0, + "grad_norm": 2.106238838173772, + "language_loss": 0.73751926, + "learning_rate": 2.926048969151407e-06, + "loss": 0.75976467, + "num_input_tokens_seen": 65388185, + "step": 3047, + "time_per_iteration": 2.4002580642700195 + }, + { + "auxiliary_loss_clip": 0.0113082, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.05459344, + "balance_loss_mlp": 1.01987267, + "epoch": 0.36650033066794924, + "flos": 20303606407680.0, + "grad_norm": 1.9309487664588667, + "language_loss": 0.68730938, + "learning_rate": 2.92535846092241e-06, + "loss": 0.70890272, + "num_input_tokens_seen": 65407200, + "step": 3048, + "time_per_iteration": 2.5554943084716797 + }, + { + "auxiliary_loss_clip": 0.01165626, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.05683959, + "balance_loss_mlp": 1.02328801, + "epoch": 0.36662057355858835, + "flos": 24716237420160.0, + "grad_norm": 2.3744138426062364, + "language_loss": 0.82654011, + "learning_rate": 2.9246678123172704e-06, + "loss": 0.84851193, + "num_input_tokens_seen": 65427290, + "step": 3049, + "time_per_iteration": 2.5517947673797607 + }, + { + "auxiliary_loss_clip": 0.01192093, + "auxiliary_loss_mlp": 0.0103574, + "balance_loss_clip": 1.05754066, + "balance_loss_mlp": 1.02699268, + "epoch": 0.36674081644922746, + "flos": 12385267902720.0, + "grad_norm": 2.156203795964543, + "language_loss": 0.74395168, + "learning_rate": 2.9239770234407596e-06, + "loss": 0.76622999, + "num_input_tokens_seen": 65445595, + "step": 3050, + "time_per_iteration": 2.405888557434082 + }, + { + "auxiliary_loss_clip": 0.01176614, + "auxiliary_loss_mlp": 0.01027405, + "balance_loss_clip": 1.05471969, + "balance_loss_mlp": 1.01870859, + "epoch": 0.3668610593398665, + "flos": 21105922544640.0, + "grad_norm": 1.5836866752241445, + "language_loss": 0.68324101, + "learning_rate": 2.9232860943976686e-06, + "loss": 0.7052812, + "num_input_tokens_seen": 65466330, + "step": 3051, + "time_per_iteration": 2.472597360610962 + }, + { + "auxiliary_loss_clip": 0.01159864, + "auxiliary_loss_mlp": 0.01025299, + "balance_loss_clip": 1.05535579, + "balance_loss_mlp": 1.01712132, + "epoch": 0.3669813022305056, + "flos": 26758082039040.0, + "grad_norm": 1.6676428651189341, + "language_loss": 0.83898771, + "learning_rate": 2.9225950252928115e-06, + "loss": 0.86083931, + "num_input_tokens_seen": 65487180, + "step": 3052, + "time_per_iteration": 2.5359652042388916 + }, + { + "auxiliary_loss_clip": 0.01178571, + "auxiliary_loss_mlp": 0.00763803, + "balance_loss_clip": 1.0590893, + "balance_loss_mlp": 1.0006907, + "epoch": 0.36710154512114473, + "flos": 19099521671040.0, + "grad_norm": 2.181883091419619, + "language_loss": 0.81489873, + "learning_rate": 2.9219038162310217e-06, + "loss": 0.83432251, + "num_input_tokens_seen": 65505380, + "step": 3053, + "time_per_iteration": 2.4488818645477295 + }, + { + "auxiliary_loss_clip": 0.01109769, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.05098641, + "balance_loss_mlp": 1.02443075, + "epoch": 0.3672217880117838, + "flos": 20812029465600.0, + "grad_norm": 2.055129007854651, + "language_loss": 0.8281616, + "learning_rate": 2.921212467317157e-06, + "loss": 0.84958726, + "num_input_tokens_seen": 65524825, + "step": 3054, + "time_per_iteration": 2.6163485050201416 + }, + { + "auxiliary_loss_clip": 0.01149447, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.05147016, + "balance_loss_mlp": 1.02282739, + "epoch": 0.3673420309024229, + "flos": 13590394133760.0, + "grad_norm": 1.8860546042439028, + "language_loss": 0.80125493, + "learning_rate": 2.920520978656093e-06, + "loss": 0.82306629, + "num_input_tokens_seen": 65541790, + "step": 3055, + "time_per_iteration": 2.4380366802215576 + }, + { + "auxiliary_loss_clip": 0.01187204, + "auxiliary_loss_mlp": 0.00763264, + "balance_loss_clip": 1.05628812, + "balance_loss_mlp": 1.00071919, + "epoch": 0.367462273793062, + "flos": 28986877969920.0, + "grad_norm": 1.9361724785641345, + "language_loss": 0.7692529, + "learning_rate": 2.919829350352729e-06, + "loss": 0.78875756, + "num_input_tokens_seen": 65563395, + "step": 3056, + "time_per_iteration": 3.316824436187744 + }, + { + "auxiliary_loss_clip": 0.01096426, + "auxiliary_loss_mlp": 0.01009773, + "balance_loss_clip": 1.02844, + "balance_loss_mlp": 1.00865889, + "epoch": 0.36758251668370107, + "flos": 62643148346880.0, + "grad_norm": 0.7570547829826064, + "language_loss": 0.60030913, + "learning_rate": 2.919137582511983e-06, + "loss": 0.62137109, + "num_input_tokens_seen": 65619835, + "step": 3057, + "time_per_iteration": 2.945138931274414 + }, + { + "auxiliary_loss_clip": 0.011577, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.05977607, + "balance_loss_mlp": 1.02150011, + "epoch": 0.3677027595743402, + "flos": 12713886455040.0, + "grad_norm": 1.8878518158817985, + "language_loss": 0.64157999, + "learning_rate": 2.918445675238797e-06, + "loss": 0.66345227, + "num_input_tokens_seen": 65636760, + "step": 3058, + "time_per_iteration": 2.4971792697906494 + }, + { + "auxiliary_loss_clip": 0.01191634, + "auxiliary_loss_mlp": 0.0102699, + "balance_loss_clip": 1.05725479, + "balance_loss_mlp": 1.01840043, + "epoch": 0.36782300246497923, + "flos": 25046579825280.0, + "grad_norm": 1.7956780496879863, + "language_loss": 0.69444293, + "learning_rate": 2.917753628638132e-06, + "loss": 0.71662921, + "num_input_tokens_seen": 65657065, + "step": 3059, + "time_per_iteration": 2.475940465927124 + }, + { + "auxiliary_loss_clip": 0.01163311, + "auxiliary_loss_mlp": 0.01027772, + "balance_loss_clip": 1.05672216, + "balance_loss_mlp": 1.01886678, + "epoch": 0.36794324535561834, + "flos": 17419512706560.0, + "grad_norm": 2.0690827775904648, + "language_loss": 0.70161605, + "learning_rate": 2.9170614428149716e-06, + "loss": 0.72352684, + "num_input_tokens_seen": 65675400, + "step": 3060, + "time_per_iteration": 2.4628260135650635 + }, + { + "auxiliary_loss_clip": 0.01144074, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.05368018, + "balance_loss_mlp": 1.02645516, + "epoch": 0.36806348824625745, + "flos": 24089128848000.0, + "grad_norm": 3.063828600476903, + "language_loss": 0.86817425, + "learning_rate": 2.9163691178743195e-06, + "loss": 0.88997173, + "num_input_tokens_seen": 65694050, + "step": 3061, + "time_per_iteration": 3.4269320964813232 + }, + { + "auxiliary_loss_clip": 0.01172165, + "auxiliary_loss_mlp": 0.01028956, + "balance_loss_clip": 1.05546153, + "balance_loss_mlp": 1.0203371, + "epoch": 0.3681837311368965, + "flos": 20521871400960.0, + "grad_norm": 1.7580222715142169, + "language_loss": 0.7723487, + "learning_rate": 2.9156766539212006e-06, + "loss": 0.79435992, + "num_input_tokens_seen": 65711695, + "step": 3062, + "time_per_iteration": 3.234375 + }, + { + "auxiliary_loss_clip": 0.01178483, + "auxiliary_loss_mlp": 0.01040463, + "balance_loss_clip": 1.05431294, + "balance_loss_mlp": 1.03173709, + "epoch": 0.3683039740275356, + "flos": 21466644877440.0, + "grad_norm": 1.8889714124636634, + "language_loss": 0.71276712, + "learning_rate": 2.9149840510606614e-06, + "loss": 0.73495656, + "num_input_tokens_seen": 65730350, + "step": 3063, + "time_per_iteration": 2.477159261703491 + }, + { + "auxiliary_loss_clip": 0.01080107, + "auxiliary_loss_mlp": 0.00754051, + "balance_loss_clip": 1.02423072, + "balance_loss_mlp": 1.0013026, + "epoch": 0.36842421691817473, + "flos": 70380999987840.0, + "grad_norm": 1.026092408576825, + "language_loss": 0.64183807, + "learning_rate": 2.914291309397769e-06, + "loss": 0.66017962, + "num_input_tokens_seen": 65787820, + "step": 3064, + "time_per_iteration": 3.151080846786499 + }, + { + "auxiliary_loss_clip": 0.01107949, + "auxiliary_loss_mlp": 0.01028576, + "balance_loss_clip": 1.04762554, + "balance_loss_mlp": 1.01931953, + "epoch": 0.3685444598088138, + "flos": 23331378510720.0, + "grad_norm": 2.2887398058454718, + "language_loss": 0.78912234, + "learning_rate": 2.9135984290376117e-06, + "loss": 0.81048763, + "num_input_tokens_seen": 65806685, + "step": 3065, + "time_per_iteration": 2.6122238636016846 + }, + { + "auxiliary_loss_clip": 0.01115805, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.04797053, + "balance_loss_mlp": 1.02332294, + "epoch": 0.3686647026994529, + "flos": 23070271570560.0, + "grad_norm": 1.683605623804839, + "language_loss": 0.82786429, + "learning_rate": 2.9129054100853e-06, + "loss": 0.84934103, + "num_input_tokens_seen": 65825525, + "step": 3066, + "time_per_iteration": 2.601835012435913 + }, + { + "auxiliary_loss_clip": 0.01162742, + "auxiliary_loss_mlp": 0.01028063, + "balance_loss_clip": 1.05429351, + "balance_loss_mlp": 1.01949191, + "epoch": 0.368784945590092, + "flos": 25119909440640.0, + "grad_norm": 1.6604839501720707, + "language_loss": 0.75971091, + "learning_rate": 2.912212252645963e-06, + "loss": 0.78161895, + "num_input_tokens_seen": 65848110, + "step": 3067, + "time_per_iteration": 2.551729679107666 + }, + { + "auxiliary_loss_clip": 0.01180342, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.05453181, + "balance_loss_mlp": 1.02104223, + "epoch": 0.36890518848073106, + "flos": 18442284566400.0, + "grad_norm": 2.0223062882198777, + "language_loss": 0.76295191, + "learning_rate": 2.9115189568247523e-06, + "loss": 0.78505194, + "num_input_tokens_seen": 65865670, + "step": 3068, + "time_per_iteration": 2.447573184967041 + }, + { + "auxiliary_loss_clip": 0.0112468, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.0548141, + "balance_loss_mlp": 1.02363145, + "epoch": 0.36902543137137017, + "flos": 16362446336640.0, + "grad_norm": 1.9020381572551877, + "language_loss": 0.92165548, + "learning_rate": 2.910825522726841e-06, + "loss": 0.94322485, + "num_input_tokens_seen": 65883195, + "step": 3069, + "time_per_iteration": 2.52885103225708 + }, + { + "auxiliary_loss_clip": 0.01125887, + "auxiliary_loss_mlp": 0.01032038, + "balance_loss_clip": 1.04700148, + "balance_loss_mlp": 1.02358055, + "epoch": 0.3691456742620093, + "flos": 12275596702080.0, + "grad_norm": 1.9053274525265584, + "language_loss": 0.77213216, + "learning_rate": 2.9101319504574215e-06, + "loss": 0.79371142, + "num_input_tokens_seen": 65899635, + "step": 3070, + "time_per_iteration": 2.5252184867858887 + }, + { + "auxiliary_loss_clip": 0.01164673, + "auxiliary_loss_mlp": 0.01031266, + "balance_loss_clip": 1.05078316, + "balance_loss_mlp": 1.02218807, + "epoch": 0.36926591715264834, + "flos": 17786412178560.0, + "grad_norm": 1.8746489748561939, + "language_loss": 0.76626921, + "learning_rate": 2.909438240121709e-06, + "loss": 0.78822857, + "num_input_tokens_seen": 65919910, + "step": 3071, + "time_per_iteration": 2.4977846145629883 + }, + { + "auxiliary_loss_clip": 0.01154717, + "auxiliary_loss_mlp": 0.01025087, + "balance_loss_clip": 1.05415845, + "balance_loss_mlp": 1.01650381, + "epoch": 0.36938616004328745, + "flos": 28948309741440.0, + "grad_norm": 1.740603464270166, + "language_loss": 0.70372581, + "learning_rate": 2.908744391824939e-06, + "loss": 0.72552383, + "num_input_tokens_seen": 65940930, + "step": 3072, + "time_per_iteration": 2.576730728149414 + }, + { + "auxiliary_loss_clip": 0.01119397, + "auxiliary_loss_mlp": 0.01027762, + "balance_loss_clip": 1.04758203, + "balance_loss_mlp": 1.01906002, + "epoch": 0.36950640293392656, + "flos": 29205394358400.0, + "grad_norm": 1.6289692631132964, + "language_loss": 0.78989983, + "learning_rate": 2.908050405672367e-06, + "loss": 0.81137145, + "num_input_tokens_seen": 65960475, + "step": 3073, + "time_per_iteration": 2.6455554962158203 + }, + { + "auxiliary_loss_clip": 0.01164723, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.05042887, + "balance_loss_mlp": 1.02133775, + "epoch": 0.3696266458245656, + "flos": 24827776128000.0, + "grad_norm": 1.7226198747933599, + "language_loss": 0.7927829, + "learning_rate": 2.9073562817692703e-06, + "loss": 0.81473047, + "num_input_tokens_seen": 65979160, + "step": 3074, + "time_per_iteration": 2.52937912940979 + }, + { + "auxiliary_loss_clip": 0.0105101, + "auxiliary_loss_mlp": 0.01005299, + "balance_loss_clip": 1.02100444, + "balance_loss_mlp": 1.00398815, + "epoch": 0.3697468887152047, + "flos": 59887257264000.0, + "grad_norm": 0.7234587820058428, + "language_loss": 0.56523782, + "learning_rate": 2.9066620202209468e-06, + "loss": 0.58580089, + "num_input_tokens_seen": 66041650, + "step": 3075, + "time_per_iteration": 3.073305368423462 + }, + { + "auxiliary_loss_clip": 0.01137561, + "auxiliary_loss_mlp": 0.01028035, + "balance_loss_clip": 1.0513308, + "balance_loss_mlp": 1.01965511, + "epoch": 0.3698671316058438, + "flos": 26137581569280.0, + "grad_norm": 1.901466344564647, + "language_loss": 0.77521539, + "learning_rate": 2.905967621132716e-06, + "loss": 0.7968713, + "num_input_tokens_seen": 66059260, + "step": 3076, + "time_per_iteration": 2.55488920211792 + }, + { + "auxiliary_loss_clip": 0.01164719, + "auxiliary_loss_mlp": 0.01030537, + "balance_loss_clip": 1.05217421, + "balance_loss_mlp": 1.02199626, + "epoch": 0.3699873744964829, + "flos": 24607464059520.0, + "grad_norm": 1.7925147239513082, + "language_loss": 0.75092286, + "learning_rate": 2.9052730846099172e-06, + "loss": 0.77287543, + "num_input_tokens_seen": 66080605, + "step": 3077, + "time_per_iteration": 2.532503843307495 + }, + { + "auxiliary_loss_clip": 0.01067245, + "auxiliary_loss_mlp": 0.0100115, + "balance_loss_clip": 1.02181292, + "balance_loss_mlp": 0.99992198, + "epoch": 0.370107617387122, + "flos": 64885340050560.0, + "grad_norm": 0.8510180624822664, + "language_loss": 0.60826206, + "learning_rate": 2.9045784107579123e-06, + "loss": 0.62894601, + "num_input_tokens_seen": 66140710, + "step": 3078, + "time_per_iteration": 3.093737840652466 + }, + { + "auxiliary_loss_clip": 0.01188915, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.05695796, + "balance_loss_mlp": 1.01915979, + "epoch": 0.37022786027776106, + "flos": 15961683317760.0, + "grad_norm": 2.928882061265177, + "language_loss": 0.66829419, + "learning_rate": 2.9038835996820807e-06, + "loss": 0.69046259, + "num_input_tokens_seen": 66158320, + "step": 3079, + "time_per_iteration": 2.416414737701416 + }, + { + "auxiliary_loss_clip": 0.01149601, + "auxiliary_loss_mlp": 0.01030944, + "balance_loss_clip": 1.04854727, + "balance_loss_mlp": 1.02205062, + "epoch": 0.37034810316840017, + "flos": 18546927863040.0, + "grad_norm": 1.776021176644969, + "language_loss": 0.79789054, + "learning_rate": 2.903188651487826e-06, + "loss": 0.81969607, + "num_input_tokens_seen": 66176875, + "step": 3080, + "time_per_iteration": 2.505650758743286 + }, + { + "auxiliary_loss_clip": 0.01179263, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.05684805, + "balance_loss_mlp": 1.02081203, + "epoch": 0.3704683460590393, + "flos": 17821927751040.0, + "grad_norm": 2.434656837212417, + "language_loss": 0.86916959, + "learning_rate": 2.902493566280571e-06, + "loss": 0.89125919, + "num_input_tokens_seen": 66194980, + "step": 3081, + "time_per_iteration": 2.4463279247283936 + }, + { + "auxiliary_loss_clip": 0.01159309, + "auxiliary_loss_mlp": 0.01027246, + "balance_loss_clip": 1.0534687, + "balance_loss_mlp": 1.01810908, + "epoch": 0.37058858894967833, + "flos": 14134081368960.0, + "grad_norm": 8.847284099012898, + "language_loss": 0.81579638, + "learning_rate": 2.9017983441657595e-06, + "loss": 0.83766186, + "num_input_tokens_seen": 66212310, + "step": 3082, + "time_per_iteration": 2.4657857418060303 + }, + { + "auxiliary_loss_clip": 0.01131881, + "auxiliary_loss_mlp": 0.01027357, + "balance_loss_clip": 1.0462513, + "balance_loss_mlp": 1.01853573, + "epoch": 0.37070883184031744, + "flos": 13954492344960.0, + "grad_norm": 2.7018465906506113, + "language_loss": 0.75281215, + "learning_rate": 2.9011029852488564e-06, + "loss": 0.77440453, + "num_input_tokens_seen": 66229545, + "step": 3083, + "time_per_iteration": 3.397322654724121 + }, + { + "auxiliary_loss_clip": 0.01084857, + "auxiliary_loss_mlp": 0.01000838, + "balance_loss_clip": 1.01707315, + "balance_loss_mlp": 0.99968141, + "epoch": 0.37082907473095655, + "flos": 52315419306240.0, + "grad_norm": 0.9860272000605642, + "language_loss": 0.62533206, + "learning_rate": 2.9004074896353465e-06, + "loss": 0.64618897, + "num_input_tokens_seen": 66283545, + "step": 3084, + "time_per_iteration": 2.960568904876709 + }, + { + "auxiliary_loss_clip": 0.0119141, + "auxiliary_loss_mlp": 0.01026881, + "balance_loss_clip": 1.06215191, + "balance_loss_mlp": 1.01940107, + "epoch": 0.3709493176215956, + "flos": 15998096730240.0, + "grad_norm": 1.7764345363015674, + "language_loss": 0.81793094, + "learning_rate": 2.8997118574307362e-06, + "loss": 0.84011394, + "num_input_tokens_seen": 66300500, + "step": 3085, + "time_per_iteration": 2.4666192531585693 + }, + { + "auxiliary_loss_clip": 0.01152056, + "auxiliary_loss_mlp": 0.01030487, + "balance_loss_clip": 1.05247355, + "balance_loss_mlp": 1.02201104, + "epoch": 0.3710695605122347, + "flos": 20959837931520.0, + "grad_norm": 2.2091659403696626, + "language_loss": 0.74374753, + "learning_rate": 2.899016088740553e-06, + "loss": 0.76557302, + "num_input_tokens_seen": 66318610, + "step": 3086, + "time_per_iteration": 2.5196192264556885 + }, + { + "auxiliary_loss_clip": 0.01129173, + "auxiliary_loss_mlp": 0.01025001, + "balance_loss_clip": 1.04856503, + "balance_loss_mlp": 1.0173955, + "epoch": 0.37118980340287383, + "flos": 14355578586240.0, + "grad_norm": 1.7841428804893666, + "language_loss": 0.78998601, + "learning_rate": 2.898320183670344e-06, + "loss": 0.81152773, + "num_input_tokens_seen": 66336025, + "step": 3087, + "time_per_iteration": 3.3778507709503174 + }, + { + "auxiliary_loss_clip": 0.01132859, + "auxiliary_loss_mlp": 0.01029608, + "balance_loss_clip": 1.05426693, + "balance_loss_mlp": 1.02101898, + "epoch": 0.3713100462935129, + "flos": 25885381201920.0, + "grad_norm": 1.8573648256064648, + "language_loss": 0.88930058, + "learning_rate": 2.8976241423256767e-06, + "loss": 0.91092527, + "num_input_tokens_seen": 66356120, + "step": 3088, + "time_per_iteration": 4.074788331985474 + }, + { + "auxiliary_loss_clip": 0.01153462, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.05084324, + "balance_loss_mlp": 1.02316236, + "epoch": 0.371430289184152, + "flos": 30518934814080.0, + "grad_norm": 1.8199088292121155, + "language_loss": 0.67931402, + "learning_rate": 2.896927964812142e-06, + "loss": 0.70115912, + "num_input_tokens_seen": 66376685, + "step": 3089, + "time_per_iteration": 2.5718977451324463 + }, + { + "auxiliary_loss_clip": 0.01160492, + "auxiliary_loss_mlp": 0.0102944, + "balance_loss_clip": 1.0568192, + "balance_loss_mlp": 1.02039194, + "epoch": 0.37155053207479105, + "flos": 15742233175680.0, + "grad_norm": 2.3323219440267846, + "language_loss": 0.75253165, + "learning_rate": 2.8962316512353465e-06, + "loss": 0.77443093, + "num_input_tokens_seen": 66394230, + "step": 3090, + "time_per_iteration": 2.484325647354126 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01031081, + "balance_loss_clip": 1.04433692, + "balance_loss_mlp": 1.02259326, + "epoch": 0.37167077496543016, + "flos": 23404061681280.0, + "grad_norm": 1.5289820764494213, + "language_loss": 0.74942881, + "learning_rate": 2.8955352017009233e-06, + "loss": 0.77085841, + "num_input_tokens_seen": 66413475, + "step": 3091, + "time_per_iteration": 2.6065900325775146 + }, + { + "auxiliary_loss_clip": 0.01160184, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.0563153, + "balance_loss_mlp": 1.023054, + "epoch": 0.3717910178560693, + "flos": 22088653718400.0, + "grad_norm": 3.1642929808981046, + "language_loss": 0.77126801, + "learning_rate": 2.8948386163145212e-06, + "loss": 0.79318821, + "num_input_tokens_seen": 66432685, + "step": 3092, + "time_per_iteration": 2.5059359073638916 + }, + { + "auxiliary_loss_clip": 0.01179229, + "auxiliary_loss_mlp": 0.01030179, + "balance_loss_clip": 1.05617952, + "balance_loss_mlp": 1.02188253, + "epoch": 0.3719112607467083, + "flos": 26939969533440.0, + "grad_norm": 1.8237267847060572, + "language_loss": 0.79071808, + "learning_rate": 2.8941418951818135e-06, + "loss": 0.81281215, + "num_input_tokens_seen": 66452245, + "step": 3093, + "time_per_iteration": 2.4883203506469727 + }, + { + "auxiliary_loss_clip": 0.01146028, + "auxiliary_loss_mlp": 0.01032827, + "balance_loss_clip": 1.05118442, + "balance_loss_mlp": 1.02492309, + "epoch": 0.37203150363734744, + "flos": 12166500119040.0, + "grad_norm": 2.3930619083808815, + "language_loss": 0.71113873, + "learning_rate": 2.8934450384084903e-06, + "loss": 0.7329272, + "num_input_tokens_seen": 66469760, + "step": 3094, + "time_per_iteration": 2.5112884044647217 + }, + { + "auxiliary_loss_clip": 0.01155341, + "auxiliary_loss_mlp": 0.01028875, + "balance_loss_clip": 1.05298972, + "balance_loss_mlp": 1.02022672, + "epoch": 0.37215174652798655, + "flos": 23697595624320.0, + "grad_norm": 1.8284074983025054, + "language_loss": 0.69826859, + "learning_rate": 2.8927480461002653e-06, + "loss": 0.72011077, + "num_input_tokens_seen": 66489730, + "step": 3095, + "time_per_iteration": 2.5354344844818115 + }, + { + "auxiliary_loss_clip": 0.01159411, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.05086005, + "balance_loss_mlp": 1.02761877, + "epoch": 0.3722719894186256, + "flos": 17887751424000.0, + "grad_norm": 3.192339297479285, + "language_loss": 0.86417294, + "learning_rate": 2.892050918362872e-06, + "loss": 0.88613892, + "num_input_tokens_seen": 66504785, + "step": 3096, + "time_per_iteration": 2.472733974456787 + }, + { + "auxiliary_loss_clip": 0.010186, + "auxiliary_loss_mlp": 0.01003817, + "balance_loss_clip": 1.01639569, + "balance_loss_mlp": 1.00242794, + "epoch": 0.3723922323092647, + "flos": 62419891363200.0, + "grad_norm": 0.8496562944322783, + "language_loss": 0.55903089, + "learning_rate": 2.8913536553020626e-06, + "loss": 0.5792551, + "num_input_tokens_seen": 66558840, + "step": 3097, + "time_per_iteration": 3.5324528217315674 + }, + { + "auxiliary_loss_clip": 0.01123517, + "auxiliary_loss_mlp": 0.01027855, + "balance_loss_clip": 1.04819775, + "balance_loss_mlp": 1.01969504, + "epoch": 0.3725124751999038, + "flos": 23039747988480.0, + "grad_norm": 2.1081206833218884, + "language_loss": 0.84853786, + "learning_rate": 2.8906562570236137e-06, + "loss": 0.87005162, + "num_input_tokens_seen": 66576750, + "step": 3098, + "time_per_iteration": 2.938476324081421 + }, + { + "auxiliary_loss_clip": 0.01112249, + "auxiliary_loss_mlp": 0.01034002, + "balance_loss_clip": 1.04490685, + "balance_loss_mlp": 1.02617621, + "epoch": 0.3726327180905429, + "flos": 20920551431040.0, + "grad_norm": 1.4888644190420457, + "language_loss": 0.76502621, + "learning_rate": 2.889958723633318e-06, + "loss": 0.78648865, + "num_input_tokens_seen": 66595690, + "step": 3099, + "time_per_iteration": 2.6059322357177734 + }, + { + "auxiliary_loss_clip": 0.01145617, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.05128145, + "balance_loss_mlp": 1.01993418, + "epoch": 0.372752960981182, + "flos": 30592156688640.0, + "grad_norm": 1.852895213674984, + "language_loss": 0.73783535, + "learning_rate": 2.889261055236992e-06, + "loss": 0.75957549, + "num_input_tokens_seen": 66617905, + "step": 3100, + "time_per_iteration": 2.6023433208465576 + }, + { + "auxiliary_loss_clip": 0.01157703, + "auxiliary_loss_mlp": 0.01026852, + "balance_loss_clip": 1.0545969, + "balance_loss_mlp": 1.01884675, + "epoch": 0.3728732038718211, + "flos": 25116749043840.0, + "grad_norm": 1.9898948264863654, + "language_loss": 0.82810473, + "learning_rate": 2.8885632519404704e-06, + "loss": 0.84995031, + "num_input_tokens_seen": 66638175, + "step": 3101, + "time_per_iteration": 2.5411581993103027 + }, + { + "auxiliary_loss_clip": 0.01161053, + "auxiliary_loss_mlp": 0.01028527, + "balance_loss_clip": 1.0559783, + "balance_loss_mlp": 1.02005136, + "epoch": 0.37299344676246016, + "flos": 25302048330240.0, + "grad_norm": 1.996871536090944, + "language_loss": 0.75514901, + "learning_rate": 2.8878653138496107e-06, + "loss": 0.77704489, + "num_input_tokens_seen": 66658670, + "step": 3102, + "time_per_iteration": 2.539137601852417 + }, + { + "auxiliary_loss_clip": 0.01112825, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.04293156, + "balance_loss_mlp": 1.02227378, + "epoch": 0.37311368965309927, + "flos": 23842531002240.0, + "grad_norm": 2.252413582131106, + "language_loss": 0.76544374, + "learning_rate": 2.8871672410702878e-06, + "loss": 0.78688169, + "num_input_tokens_seen": 66676030, + "step": 3103, + "time_per_iteration": 2.6596555709838867 + }, + { + "auxiliary_loss_clip": 0.01154628, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.05094266, + "balance_loss_mlp": 1.02044511, + "epoch": 0.3732339325437384, + "flos": 25811943845760.0, + "grad_norm": 1.8148055266422836, + "language_loss": 0.82128298, + "learning_rate": 2.8864690337084008e-06, + "loss": 0.84312606, + "num_input_tokens_seen": 66695305, + "step": 3104, + "time_per_iteration": 2.5743672847747803 + }, + { + "auxiliary_loss_clip": 0.01169001, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.05298913, + "balance_loss_mlp": 1.02213144, + "epoch": 0.37335417543437743, + "flos": 26208433146240.0, + "grad_norm": 1.6933106138856056, + "language_loss": 0.78106463, + "learning_rate": 2.885770691869866e-06, + "loss": 0.80306888, + "num_input_tokens_seen": 66716185, + "step": 3105, + "time_per_iteration": 2.5092620849609375 + }, + { + "auxiliary_loss_clip": 0.01170532, + "auxiliary_loss_mlp": 0.0103245, + "balance_loss_clip": 1.05424166, + "balance_loss_mlp": 1.02455795, + "epoch": 0.37347441832501654, + "flos": 24023879792640.0, + "grad_norm": 2.4903035712098958, + "language_loss": 0.7450949, + "learning_rate": 2.8850722156606207e-06, + "loss": 0.76712465, + "num_input_tokens_seen": 66734575, + "step": 3106, + "time_per_iteration": 2.496767044067383 + }, + { + "auxiliary_loss_clip": 0.01167309, + "auxiliary_loss_mlp": 0.01035184, + "balance_loss_clip": 1.05325627, + "balance_loss_mlp": 1.02698195, + "epoch": 0.3735946612156556, + "flos": 19714922409600.0, + "grad_norm": 1.5434421292237188, + "language_loss": 0.66832906, + "learning_rate": 2.8843736051866252e-06, + "loss": 0.69035399, + "num_input_tokens_seen": 66753500, + "step": 3107, + "time_per_iteration": 2.4811577796936035 + }, + { + "auxiliary_loss_clip": 0.01127216, + "auxiliary_loss_mlp": 0.00763304, + "balance_loss_clip": 1.04810548, + "balance_loss_mlp": 1.00075698, + "epoch": 0.3737149041062947, + "flos": 23039604334080.0, + "grad_norm": 1.698243829242408, + "language_loss": 0.69489014, + "learning_rate": 2.8836748605538557e-06, + "loss": 0.7137953, + "num_input_tokens_seen": 66775140, + "step": 3108, + "time_per_iteration": 2.6121256351470947 + }, + { + "auxiliary_loss_clip": 0.01164737, + "auxiliary_loss_mlp": 0.01025373, + "balance_loss_clip": 1.05248666, + "balance_loss_mlp": 1.01650369, + "epoch": 0.3738351469969338, + "flos": 34678108483200.0, + "grad_norm": 3.0445117398238133, + "language_loss": 0.63462484, + "learning_rate": 2.882975981868313e-06, + "loss": 0.65652591, + "num_input_tokens_seen": 66795525, + "step": 3109, + "time_per_iteration": 3.6112146377563477 + }, + { + "auxiliary_loss_clip": 0.01173385, + "auxiliary_loss_mlp": 0.01027485, + "balance_loss_clip": 1.05592632, + "balance_loss_mlp": 1.01895547, + "epoch": 0.3739553898875729, + "flos": 43507967448960.0, + "grad_norm": 2.392857825358932, + "language_loss": 0.68686962, + "learning_rate": 2.882276969236016e-06, + "loss": 0.70887834, + "num_input_tokens_seen": 66816885, + "step": 3110, + "time_per_iteration": 2.663795232772827 + }, + { + "auxiliary_loss_clip": 0.01156132, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.05128145, + "balance_loss_mlp": 1.0205245, + "epoch": 0.374075632778212, + "flos": 12856487448960.0, + "grad_norm": 2.046303823997311, + "language_loss": 0.76305211, + "learning_rate": 2.881577822763005e-06, + "loss": 0.78490615, + "num_input_tokens_seen": 66834835, + "step": 3111, + "time_per_iteration": 2.464536190032959 + }, + { + "auxiliary_loss_clip": 0.01171917, + "auxiliary_loss_mlp": 0.01024302, + "balance_loss_clip": 1.05332112, + "balance_loss_mlp": 1.01644015, + "epoch": 0.3741958756688511, + "flos": 26024031699840.0, + "grad_norm": 1.7865842072161702, + "language_loss": 0.87259078, + "learning_rate": 2.880878542555338e-06, + "loss": 0.89455301, + "num_input_tokens_seen": 66852600, + "step": 3112, + "time_per_iteration": 2.4953858852386475 + }, + { + "auxiliary_loss_clip": 0.01189556, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.05648804, + "balance_loss_mlp": 1.02138758, + "epoch": 0.37431611855949015, + "flos": 21433894652160.0, + "grad_norm": 2.258291429189346, + "language_loss": 0.80836713, + "learning_rate": 2.8801791287190976e-06, + "loss": 0.83056241, + "num_input_tokens_seen": 66870595, + "step": 3113, + "time_per_iteration": 2.4246292114257812 + }, + { + "auxiliary_loss_clip": 0.01173967, + "auxiliary_loss_mlp": 0.01027955, + "balance_loss_clip": 1.05243027, + "balance_loss_mlp": 1.01953852, + "epoch": 0.37443636145012926, + "flos": 24207096090240.0, + "grad_norm": 2.8971806978876233, + "language_loss": 0.8596946, + "learning_rate": 2.8794795813603817e-06, + "loss": 0.88171387, + "num_input_tokens_seen": 66886060, + "step": 3114, + "time_per_iteration": 3.3249690532684326 + }, + { + "auxiliary_loss_clip": 0.01178232, + "auxiliary_loss_mlp": 0.01031788, + "balance_loss_clip": 1.0536319, + "balance_loss_mlp": 1.02303791, + "epoch": 0.3745566043407684, + "flos": 15378601841280.0, + "grad_norm": 1.8865316440154212, + "language_loss": 0.81706363, + "learning_rate": 2.878779900585314e-06, + "loss": 0.83916384, + "num_input_tokens_seen": 66903900, + "step": 3115, + "time_per_iteration": 3.2313897609710693 + }, + { + "auxiliary_loss_clip": 0.01163875, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.0530107, + "balance_loss_mlp": 1.02072275, + "epoch": 0.37467684723140743, + "flos": 24608218245120.0, + "grad_norm": 1.5205094399668995, + "language_loss": 0.75291908, + "learning_rate": 2.8780800865000336e-06, + "loss": 0.7748493, + "num_input_tokens_seen": 66925210, + "step": 3116, + "time_per_iteration": 2.549765110015869 + }, + { + "auxiliary_loss_clip": 0.01076843, + "auxiliary_loss_mlp": 0.01006435, + "balance_loss_clip": 1.01985288, + "balance_loss_mlp": 1.00526702, + "epoch": 0.37479709012204654, + "flos": 64377491610240.0, + "grad_norm": 0.975965565243997, + "language_loss": 0.59196055, + "learning_rate": 2.877380139210702e-06, + "loss": 0.61279333, + "num_input_tokens_seen": 66983880, + "step": 3117, + "time_per_iteration": 3.0146188735961914 + }, + { + "auxiliary_loss_clip": 0.01146186, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.05080748, + "balance_loss_mlp": 1.02374387, + "epoch": 0.37491733301268565, + "flos": 23803962773760.0, + "grad_norm": 2.1955290934198333, + "language_loss": 0.76614767, + "learning_rate": 2.876680058823501e-06, + "loss": 0.78793955, + "num_input_tokens_seen": 67004280, + "step": 3118, + "time_per_iteration": 2.557805299758911 + }, + { + "auxiliary_loss_clip": 0.01148362, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.04963684, + "balance_loss_mlp": 1.02151227, + "epoch": 0.3750375759033247, + "flos": 32160950167680.0, + "grad_norm": 1.6945757924703555, + "language_loss": 0.6601609, + "learning_rate": 2.8759798454446314e-06, + "loss": 0.68194646, + "num_input_tokens_seen": 67027445, + "step": 3119, + "time_per_iteration": 2.588473081588745 + }, + { + "auxiliary_loss_clip": 0.0117607, + "auxiliary_loss_mlp": 0.01038566, + "balance_loss_clip": 1.0547415, + "balance_loss_mlp": 1.0303998, + "epoch": 0.3751578187939638, + "flos": 23367791923200.0, + "grad_norm": 2.007764817957837, + "language_loss": 0.81639218, + "learning_rate": 2.8752794991803173e-06, + "loss": 0.83853859, + "num_input_tokens_seen": 67045130, + "step": 3120, + "time_per_iteration": 2.4711804389953613 + }, + { + "auxiliary_loss_clip": 0.01156865, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.05206013, + "balance_loss_mlp": 1.02383029, + "epoch": 0.37527806168460287, + "flos": 14605731878400.0, + "grad_norm": 2.614544852039984, + "language_loss": 0.75237489, + "learning_rate": 2.8745790201367976e-06, + "loss": 0.77426612, + "num_input_tokens_seen": 67060885, + "step": 3121, + "time_per_iteration": 2.4698715209960938 + }, + { + "auxiliary_loss_clip": 0.01190986, + "auxiliary_loss_mlp": 0.01036863, + "balance_loss_clip": 1.05686259, + "balance_loss_mlp": 1.02804184, + "epoch": 0.375398304575242, + "flos": 26390823431040.0, + "grad_norm": 3.286901787291444, + "language_loss": 0.84436589, + "learning_rate": 2.8738784084203373e-06, + "loss": 0.86664438, + "num_input_tokens_seen": 67080960, + "step": 3122, + "time_per_iteration": 2.469667673110962 + }, + { + "auxiliary_loss_clip": 0.01149316, + "auxiliary_loss_mlp": 0.01026991, + "balance_loss_clip": 1.04714763, + "balance_loss_mlp": 1.01893294, + "epoch": 0.3755185474658811, + "flos": 22236605838720.0, + "grad_norm": 1.6258532256173992, + "language_loss": 0.78889155, + "learning_rate": 2.873177664137216e-06, + "loss": 0.81065464, + "num_input_tokens_seen": 67101890, + "step": 3123, + "time_per_iteration": 2.5259437561035156 + }, + { + "auxiliary_loss_clip": 0.01138532, + "auxiliary_loss_mlp": 0.01024502, + "balance_loss_clip": 1.05095363, + "balance_loss_mlp": 1.01613986, + "epoch": 0.37563879035652015, + "flos": 30812935633920.0, + "grad_norm": 1.700286020426204, + "language_loss": 0.69339776, + "learning_rate": 2.8724767873937384e-06, + "loss": 0.71502805, + "num_input_tokens_seen": 67126010, + "step": 3124, + "time_per_iteration": 2.6263017654418945 + }, + { + "auxiliary_loss_clip": 0.01158443, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.0518086, + "balance_loss_mlp": 1.02633464, + "epoch": 0.37575903324715926, + "flos": 20773533064320.0, + "grad_norm": 2.5622354056585364, + "language_loss": 0.87532628, + "learning_rate": 2.871775778296225e-06, + "loss": 0.89725614, + "num_input_tokens_seen": 67143100, + "step": 3125, + "time_per_iteration": 2.4712038040161133 + }, + { + "auxiliary_loss_clip": 0.01176876, + "auxiliary_loss_mlp": 0.0103428, + "balance_loss_clip": 1.05744195, + "balance_loss_mlp": 1.02479053, + "epoch": 0.37587927613779837, + "flos": 18697681244160.0, + "grad_norm": 1.9549841962074688, + "language_loss": 0.78357667, + "learning_rate": 2.8710746369510196e-06, + "loss": 0.8056882, + "num_input_tokens_seen": 67161085, + "step": 3126, + "time_per_iteration": 2.443674325942993 + }, + { + "auxiliary_loss_clip": 0.01153252, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.05298221, + "balance_loss_mlp": 1.02138352, + "epoch": 0.3759995190284374, + "flos": 13624796384640.0, + "grad_norm": 2.392028777529309, + "language_loss": 0.83229095, + "learning_rate": 2.8703733634644846e-06, + "loss": 0.8541187, + "num_input_tokens_seen": 67175840, + "step": 3127, + "time_per_iteration": 2.449815273284912 + }, + { + "auxiliary_loss_clip": 0.01184592, + "auxiliary_loss_mlp": 0.01028502, + "balance_loss_clip": 1.05540895, + "balance_loss_mlp": 1.02022266, + "epoch": 0.37611976191907653, + "flos": 20484847457280.0, + "grad_norm": 1.765232265266723, + "language_loss": 0.79220593, + "learning_rate": 2.869671957943002e-06, + "loss": 0.8143369, + "num_input_tokens_seen": 67194995, + "step": 3128, + "time_per_iteration": 2.4282548427581787 + }, + { + "auxiliary_loss_clip": 0.0115593, + "auxiliary_loss_mlp": 0.01028357, + "balance_loss_clip": 1.05754912, + "balance_loss_mlp": 1.01995301, + "epoch": 0.37624000480971564, + "flos": 21141797253120.0, + "grad_norm": 1.7464556906682398, + "language_loss": 0.73718297, + "learning_rate": 2.8689704204929747e-06, + "loss": 0.75902581, + "num_input_tokens_seen": 67214175, + "step": 3129, + "time_per_iteration": 2.496959924697876 + }, + { + "auxiliary_loss_clip": 0.01186897, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.05572617, + "balance_loss_mlp": 1.02212489, + "epoch": 0.3763602477003547, + "flos": 22564470205440.0, + "grad_norm": 1.723356042838573, + "language_loss": 0.81053418, + "learning_rate": 2.8682687512208253e-06, + "loss": 0.83270943, + "num_input_tokens_seen": 67233185, + "step": 3130, + "time_per_iteration": 2.434842348098755 + }, + { + "auxiliary_loss_clip": 0.01179706, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.05539799, + "balance_loss_mlp": 1.02483177, + "epoch": 0.3764804905909938, + "flos": 27526857851520.0, + "grad_norm": 2.198284008165177, + "language_loss": 0.80208343, + "learning_rate": 2.8675669502329972e-06, + "loss": 0.82421649, + "num_input_tokens_seen": 67254715, + "step": 3131, + "time_per_iteration": 2.5020923614501953 + }, + { + "auxiliary_loss_clip": 0.01175319, + "auxiliary_loss_mlp": 0.00763399, + "balance_loss_clip": 1.0556109, + "balance_loss_mlp": 1.00060058, + "epoch": 0.3766007334816329, + "flos": 22528092706560.0, + "grad_norm": 2.4549663549152756, + "language_loss": 0.85911304, + "learning_rate": 2.866865017635952e-06, + "loss": 0.87850022, + "num_input_tokens_seen": 67272535, + "step": 3132, + "time_per_iteration": 2.4791293144226074 + }, + { + "auxiliary_loss_clip": 0.01144208, + "auxiliary_loss_mlp": 0.01027583, + "balance_loss_clip": 1.05422175, + "balance_loss_mlp": 1.01894581, + "epoch": 0.376720976372272, + "flos": 25957166532480.0, + "grad_norm": 1.5446627872810743, + "language_loss": 0.79567385, + "learning_rate": 2.866162953536174e-06, + "loss": 0.81739175, + "num_input_tokens_seen": 67293505, + "step": 3133, + "time_per_iteration": 2.5564582347869873 + }, + { + "auxiliary_loss_clip": 0.01156655, + "auxiliary_loss_mlp": 0.00763008, + "balance_loss_clip": 1.05179453, + "balance_loss_mlp": 1.00053525, + "epoch": 0.3768412192629111, + "flos": 18041162411520.0, + "grad_norm": 1.6167623823912223, + "language_loss": 0.75061142, + "learning_rate": 2.8654607580401634e-06, + "loss": 0.76980805, + "num_input_tokens_seen": 67313240, + "step": 3134, + "time_per_iteration": 2.5095746517181396 + }, + { + "auxiliary_loss_clip": 0.01074758, + "auxiliary_loss_mlp": 0.01001733, + "balance_loss_clip": 1.01861119, + "balance_loss_mlp": 1.00059497, + "epoch": 0.3769614621535502, + "flos": 62989472304000.0, + "grad_norm": 0.8811534456221024, + "language_loss": 0.65200293, + "learning_rate": 2.8647584312544446e-06, + "loss": 0.67276788, + "num_input_tokens_seen": 67378445, + "step": 3135, + "time_per_iteration": 3.070974588394165 + }, + { + "auxiliary_loss_clip": 0.01138074, + "auxiliary_loss_mlp": 0.00763079, + "balance_loss_clip": 1.04827738, + "balance_loss_mlp": 1.00057137, + "epoch": 0.37708170504418925, + "flos": 23661685002240.0, + "grad_norm": 1.3930429172460972, + "language_loss": 0.85039067, + "learning_rate": 2.864055973285559e-06, + "loss": 0.86940217, + "num_input_tokens_seen": 67400445, + "step": 3136, + "time_per_iteration": 3.4620230197906494 + }, + { + "auxiliary_loss_clip": 0.01148162, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.05047369, + "balance_loss_mlp": 1.02497792, + "epoch": 0.37720194793482836, + "flos": 24423170353920.0, + "grad_norm": 1.7110220162382623, + "language_loss": 0.8608647, + "learning_rate": 2.8633533842400698e-06, + "loss": 0.88268363, + "num_input_tokens_seen": 67420645, + "step": 3137, + "time_per_iteration": 2.5413384437561035 + }, + { + "auxiliary_loss_clip": 0.01172878, + "auxiliary_loss_mlp": 0.00763479, + "balance_loss_clip": 1.05405211, + "balance_loss_mlp": 1.00065172, + "epoch": 0.3773221908254674, + "flos": 20996502739200.0, + "grad_norm": 1.8067888065358066, + "language_loss": 0.77202499, + "learning_rate": 2.862650664224558e-06, + "loss": 0.79138851, + "num_input_tokens_seen": 67439495, + "step": 3138, + "time_per_iteration": 2.474343776702881 + }, + { + "auxiliary_loss_clip": 0.01171471, + "auxiliary_loss_mlp": 0.01025909, + "balance_loss_clip": 1.05762529, + "balance_loss_mlp": 1.01851761, + "epoch": 0.37744243371610653, + "flos": 37631724958080.0, + "grad_norm": 1.400180293354287, + "language_loss": 0.6971063, + "learning_rate": 2.861947813345627e-06, + "loss": 0.71908009, + "num_input_tokens_seen": 67462195, + "step": 3139, + "time_per_iteration": 2.6568715572357178 + }, + { + "auxiliary_loss_clip": 0.0119042, + "auxiliary_loss_mlp": 0.00763213, + "balance_loss_clip": 1.05854702, + "balance_loss_mlp": 1.000525, + "epoch": 0.37756267660674564, + "flos": 26140526484480.0, + "grad_norm": 2.9436515015122033, + "language_loss": 0.72500849, + "learning_rate": 2.8612448317098974e-06, + "loss": 0.74454486, + "num_input_tokens_seen": 67482530, + "step": 3140, + "time_per_iteration": 3.3453752994537354 + }, + { + "auxiliary_loss_clip": 0.01147114, + "auxiliary_loss_mlp": 0.00763002, + "balance_loss_clip": 1.05067492, + "balance_loss_mlp": 1.00060177, + "epoch": 0.3776829194973847, + "flos": 19427888828160.0, + "grad_norm": 2.1740787755318682, + "language_loss": 0.83223599, + "learning_rate": 2.8605417194240114e-06, + "loss": 0.85133713, + "num_input_tokens_seen": 67500890, + "step": 3141, + "time_per_iteration": 2.5429534912109375 + }, + { + "auxiliary_loss_clip": 0.01164911, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.05115652, + "balance_loss_mlp": 1.01951098, + "epoch": 0.3778031623880238, + "flos": 17382309194880.0, + "grad_norm": 2.9788640345191495, + "language_loss": 0.78936994, + "learning_rate": 2.8598384765946315e-06, + "loss": 0.81129164, + "num_input_tokens_seen": 67519545, + "step": 3142, + "time_per_iteration": 3.2364134788513184 + }, + { + "auxiliary_loss_clip": 0.01185699, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.05346358, + "balance_loss_mlp": 1.02037144, + "epoch": 0.3779234052786629, + "flos": 27125843437440.0, + "grad_norm": 1.8769201854494677, + "language_loss": 0.7180388, + "learning_rate": 2.8591351033284377e-06, + "loss": 0.74018073, + "num_input_tokens_seen": 67539275, + "step": 3143, + "time_per_iteration": 3.2351582050323486 + }, + { + "auxiliary_loss_clip": 0.011739, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.05205965, + "balance_loss_mlp": 1.01873887, + "epoch": 0.37804364816930197, + "flos": 19682639061120.0, + "grad_norm": 2.015355793944523, + "language_loss": 0.83983552, + "learning_rate": 2.8584315997321325e-06, + "loss": 0.86184108, + "num_input_tokens_seen": 67558280, + "step": 3144, + "time_per_iteration": 2.50832462310791 + }, + { + "auxiliary_loss_clip": 0.01186059, + "auxiliary_loss_mlp": 0.0102712, + "balance_loss_clip": 1.05478227, + "balance_loss_mlp": 1.01867449, + "epoch": 0.3781638910599411, + "flos": 22702905221760.0, + "grad_norm": 4.6456889317445595, + "language_loss": 0.780828, + "learning_rate": 2.8577279659124356e-06, + "loss": 0.8029598, + "num_input_tokens_seen": 67575955, + "step": 3145, + "time_per_iteration": 2.43192458152771 + }, + { + "auxiliary_loss_clip": 0.0116686, + "auxiliary_loss_mlp": 0.01026431, + "balance_loss_clip": 1.0515486, + "balance_loss_mlp": 1.01945472, + "epoch": 0.3782841339505802, + "flos": 14647604158080.0, + "grad_norm": 1.746516935686437, + "language_loss": 0.83336186, + "learning_rate": 2.857024201976089e-06, + "loss": 0.85529482, + "num_input_tokens_seen": 67593515, + "step": 3146, + "time_per_iteration": 2.4630327224731445 + }, + { + "auxiliary_loss_clip": 0.01155457, + "auxiliary_loss_mlp": 0.01026075, + "balance_loss_clip": 1.05408263, + "balance_loss_mlp": 1.01752472, + "epoch": 0.37840437684121925, + "flos": 32818223185920.0, + "grad_norm": 1.8728539770349364, + "language_loss": 0.73618233, + "learning_rate": 2.8563203080298516e-06, + "loss": 0.75799763, + "num_input_tokens_seen": 67614290, + "step": 3147, + "time_per_iteration": 2.5908761024475098 + }, + { + "auxiliary_loss_clip": 0.01157094, + "auxiliary_loss_mlp": 0.00763172, + "balance_loss_clip": 1.05285096, + "balance_loss_mlp": 1.00055099, + "epoch": 0.37852461973185836, + "flos": 18369206346240.0, + "grad_norm": 2.079168619004708, + "language_loss": 0.89210075, + "learning_rate": 2.855616284180505e-06, + "loss": 0.9113034, + "num_input_tokens_seen": 67631340, + "step": 3148, + "time_per_iteration": 2.492553472518921 + }, + { + "auxiliary_loss_clip": 0.01077776, + "auxiliary_loss_mlp": 0.01002141, + "balance_loss_clip": 1.01902878, + "balance_loss_mlp": 1.0008595, + "epoch": 0.37864486262249747, + "flos": 59500680117120.0, + "grad_norm": 0.8727031367201991, + "language_loss": 0.66163015, + "learning_rate": 2.8549121305348477e-06, + "loss": 0.68242937, + "num_input_tokens_seen": 67691125, + "step": 3149, + "time_per_iteration": 3.014672040939331 + }, + { + "auxiliary_loss_clip": 0.01171632, + "auxiliary_loss_mlp": 0.01030233, + "balance_loss_clip": 1.05333805, + "balance_loss_mlp": 1.02304149, + "epoch": 0.3787651055131365, + "flos": 23363015414400.0, + "grad_norm": 2.0741758902264285, + "language_loss": 0.83218241, + "learning_rate": 2.8542078471997006e-06, + "loss": 0.85420108, + "num_input_tokens_seen": 67708740, + "step": 3150, + "time_per_iteration": 2.4627511501312256 + }, + { + "auxiliary_loss_clip": 0.01170095, + "auxiliary_loss_mlp": 0.01024641, + "balance_loss_clip": 1.05196786, + "balance_loss_mlp": 1.01736963, + "epoch": 0.37888534840377563, + "flos": 24601394661120.0, + "grad_norm": 1.6798829687995607, + "language_loss": 0.75739157, + "learning_rate": 2.8535034342819013e-06, + "loss": 0.77933896, + "num_input_tokens_seen": 67726150, + "step": 3151, + "time_per_iteration": 2.4999353885650635 + }, + { + "auxiliary_loss_clip": 0.01180723, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.0526576, + "balance_loss_mlp": 1.0228374, + "epoch": 0.37900559129441475, + "flos": 23986891762560.0, + "grad_norm": 1.8985002836540235, + "language_loss": 0.72623253, + "learning_rate": 2.85279889188831e-06, + "loss": 0.74834985, + "num_input_tokens_seen": 67746525, + "step": 3152, + "time_per_iteration": 2.4679253101348877 + }, + { + "auxiliary_loss_clip": 0.01140687, + "auxiliary_loss_mlp": 0.01025549, + "balance_loss_clip": 1.04550624, + "balance_loss_mlp": 1.01670969, + "epoch": 0.3791258341850538, + "flos": 24644667571200.0, + "grad_norm": 1.8830747005249437, + "language_loss": 0.81117892, + "learning_rate": 2.852094220125805e-06, + "loss": 0.83284134, + "num_input_tokens_seen": 67766035, + "step": 3153, + "time_per_iteration": 2.553452968597412 + }, + { + "auxiliary_loss_clip": 0.01172363, + "auxiliary_loss_mlp": 0.01036381, + "balance_loss_clip": 1.05350137, + "balance_loss_mlp": 1.02798831, + "epoch": 0.3792460770756929, + "flos": 17420841509760.0, + "grad_norm": 2.011030628955631, + "language_loss": 0.71175748, + "learning_rate": 2.8513894191012846e-06, + "loss": 0.73384488, + "num_input_tokens_seen": 67785015, + "step": 3154, + "time_per_iteration": 2.4432120323181152 + }, + { + "auxiliary_loss_clip": 0.01185751, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.05495906, + "balance_loss_mlp": 1.02147841, + "epoch": 0.37936631996633197, + "flos": 24206557386240.0, + "grad_norm": 1.4748569945703571, + "language_loss": 0.78986973, + "learning_rate": 2.8506844889216664e-06, + "loss": 0.8120259, + "num_input_tokens_seen": 67804400, + "step": 3155, + "time_per_iteration": 2.451220989227295 + }, + { + "auxiliary_loss_clip": 0.0107217, + "auxiliary_loss_mlp": 0.01003007, + "balance_loss_clip": 1.01990426, + "balance_loss_mlp": 1.00193441, + "epoch": 0.3794865628569711, + "flos": 70297114752000.0, + "grad_norm": 0.8607819496203047, + "language_loss": 0.62838602, + "learning_rate": 2.849979429693887e-06, + "loss": 0.64913774, + "num_input_tokens_seen": 67865385, + "step": 3156, + "time_per_iteration": 3.119034767150879 + }, + { + "auxiliary_loss_clip": 0.01181803, + "auxiliary_loss_mlp": 0.01027025, + "balance_loss_clip": 1.05283451, + "balance_loss_mlp": 1.01924086, + "epoch": 0.3796068057476102, + "flos": 15779364860160.0, + "grad_norm": 2.0084870282797436, + "language_loss": 0.74210215, + "learning_rate": 2.8492742415249042e-06, + "loss": 0.76419044, + "num_input_tokens_seen": 67883030, + "step": 3157, + "time_per_iteration": 2.5124590396881104 + }, + { + "auxiliary_loss_clip": 0.01181307, + "auxiliary_loss_mlp": 0.01024898, + "balance_loss_clip": 1.05115533, + "balance_loss_mlp": 1.01722717, + "epoch": 0.37972704863824924, + "flos": 25191694771200.0, + "grad_norm": 1.6691102650817446, + "language_loss": 0.76180249, + "learning_rate": 2.848568924521694e-06, + "loss": 0.78386456, + "num_input_tokens_seen": 67903810, + "step": 3158, + "time_per_iteration": 2.460641860961914 + }, + { + "auxiliary_loss_clip": 0.01161962, + "auxiliary_loss_mlp": 0.01024617, + "balance_loss_clip": 1.04823327, + "balance_loss_mlp": 1.01620042, + "epoch": 0.37984729152888835, + "flos": 26210372480640.0, + "grad_norm": 1.7492059876551591, + "language_loss": 0.73334646, + "learning_rate": 2.8478634787912526e-06, + "loss": 0.75521225, + "num_input_tokens_seen": 67921865, + "step": 3159, + "time_per_iteration": 2.494974136352539 + }, + { + "auxiliary_loss_clip": 0.01170244, + "auxiliary_loss_mlp": 0.0102926, + "balance_loss_clip": 1.05121291, + "balance_loss_mlp": 1.02118349, + "epoch": 0.37996753441952746, + "flos": 25629302165760.0, + "grad_norm": 1.9521277796201606, + "language_loss": 0.76368785, + "learning_rate": 2.847157904440596e-06, + "loss": 0.78568292, + "num_input_tokens_seen": 67941595, + "step": 3160, + "time_per_iteration": 2.488924503326416 + }, + { + "auxiliary_loss_clip": 0.01168733, + "auxiliary_loss_mlp": 0.01028002, + "balance_loss_clip": 1.05106163, + "balance_loss_mlp": 1.0202713, + "epoch": 0.3800877773101665, + "flos": 20118414862080.0, + "grad_norm": 1.5681746370415068, + "language_loss": 0.73709714, + "learning_rate": 2.846452201576759e-06, + "loss": 0.75906456, + "num_input_tokens_seen": 67960970, + "step": 3161, + "time_per_iteration": 2.504160165786743 + }, + { + "auxiliary_loss_clip": 0.01066212, + "auxiliary_loss_mlp": 0.01001792, + "balance_loss_clip": 1.01723588, + "balance_loss_mlp": 1.00053406, + "epoch": 0.38020802020080563, + "flos": 63053608037760.0, + "grad_norm": 0.8522443023093426, + "language_loss": 0.62794399, + "learning_rate": 2.845746370306795e-06, + "loss": 0.64862406, + "num_input_tokens_seen": 68026160, + "step": 3162, + "time_per_iteration": 3.1608963012695312 + }, + { + "auxiliary_loss_clip": 0.01170603, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.05271387, + "balance_loss_mlp": 1.02056789, + "epoch": 0.38032826309144474, + "flos": 21288420570240.0, + "grad_norm": 2.0289624462301235, + "language_loss": 0.78352225, + "learning_rate": 2.84504041073778e-06, + "loss": 0.80551469, + "num_input_tokens_seen": 68044575, + "step": 3163, + "time_per_iteration": 3.2978944778442383 + }, + { + "auxiliary_loss_clip": 0.01149744, + "auxiliary_loss_mlp": 0.01035938, + "balance_loss_clip": 1.05061769, + "balance_loss_mlp": 1.02728987, + "epoch": 0.3804485059820838, + "flos": 18954119416320.0, + "grad_norm": 1.68518794615801, + "language_loss": 0.78961897, + "learning_rate": 2.844334322976806e-06, + "loss": 0.81147581, + "num_input_tokens_seen": 68064790, + "step": 3164, + "time_per_iteration": 2.4990808963775635 + }, + { + "auxiliary_loss_clip": 0.01129429, + "auxiliary_loss_mlp": 0.01035414, + "balance_loss_clip": 1.04753578, + "balance_loss_mlp": 1.02714658, + "epoch": 0.3805687488727229, + "flos": 21833759831040.0, + "grad_norm": 1.7590650241902899, + "language_loss": 0.8353042, + "learning_rate": 2.8436281071309866e-06, + "loss": 0.85695261, + "num_input_tokens_seen": 68083330, + "step": 3165, + "time_per_iteration": 2.578946352005005 + }, + { + "auxiliary_loss_clip": 0.01043514, + "auxiliary_loss_mlp": 0.01003189, + "balance_loss_clip": 1.01465905, + "balance_loss_mlp": 1.00191975, + "epoch": 0.380688991763362, + "flos": 58546209968640.0, + "grad_norm": 0.7280922055584209, + "language_loss": 0.5307098, + "learning_rate": 2.842921763307455e-06, + "loss": 0.55117679, + "num_input_tokens_seen": 68146140, + "step": 3166, + "time_per_iteration": 3.1237878799438477 + }, + { + "auxiliary_loss_clip": 0.01147546, + "auxiliary_loss_mlp": 0.01026673, + "balance_loss_clip": 1.048563, + "balance_loss_mlp": 1.01918101, + "epoch": 0.38080923465400107, + "flos": 23799509487360.0, + "grad_norm": 1.76003533843277, + "language_loss": 0.82244015, + "learning_rate": 2.842215291613361e-06, + "loss": 0.84418237, + "num_input_tokens_seen": 68164520, + "step": 3167, + "time_per_iteration": 3.350708484649658 + }, + { + "auxiliary_loss_clip": 0.01008367, + "auxiliary_loss_mlp": 0.01000715, + "balance_loss_clip": 1.01497889, + "balance_loss_mlp": 0.99937433, + "epoch": 0.3809294775446402, + "flos": 54969866380800.0, + "grad_norm": 0.8370077518291396, + "language_loss": 0.59312236, + "learning_rate": 2.8415086921558774e-06, + "loss": 0.61321318, + "num_input_tokens_seen": 68227945, + "step": 3168, + "time_per_iteration": 3.525718927383423 + }, + { + "auxiliary_loss_clip": 0.01137842, + "auxiliary_loss_mlp": 0.01025148, + "balance_loss_clip": 1.04266143, + "balance_loss_mlp": 1.01776886, + "epoch": 0.38104972043527924, + "flos": 24643697904000.0, + "grad_norm": 1.6110889269009732, + "language_loss": 0.78616571, + "learning_rate": 2.840801965042194e-06, + "loss": 0.80779564, + "num_input_tokens_seen": 68247405, + "step": 3169, + "time_per_iteration": 4.051271438598633 + }, + { + "auxiliary_loss_clip": 0.01145332, + "auxiliary_loss_mlp": 0.01025198, + "balance_loss_clip": 1.04595494, + "balance_loss_mlp": 1.01638222, + "epoch": 0.38116996332591835, + "flos": 22856783086080.0, + "grad_norm": 1.791710454339517, + "language_loss": 0.83780932, + "learning_rate": 2.840095110379521e-06, + "loss": 0.85951465, + "num_input_tokens_seen": 68266925, + "step": 3170, + "time_per_iteration": 2.502591848373413 + }, + { + "auxiliary_loss_clip": 0.01039021, + "auxiliary_loss_mlp": 0.01001923, + "balance_loss_clip": 1.01768005, + "balance_loss_mlp": 1.00064707, + "epoch": 0.38129020621655746, + "flos": 60836160804480.0, + "grad_norm": 0.7370650773994835, + "language_loss": 0.53917062, + "learning_rate": 2.8393881282750884e-06, + "loss": 0.55958009, + "num_input_tokens_seen": 68329755, + "step": 3171, + "time_per_iteration": 3.2006514072418213 + }, + { + "auxiliary_loss_clip": 0.01152919, + "auxiliary_loss_mlp": 0.01026307, + "balance_loss_clip": 1.05122185, + "balance_loss_mlp": 1.01796198, + "epoch": 0.3814104491071965, + "flos": 21648101408640.0, + "grad_norm": 2.1167779888947877, + "language_loss": 0.78833842, + "learning_rate": 2.838681018836144e-06, + "loss": 0.8101306, + "num_input_tokens_seen": 68347075, + "step": 3172, + "time_per_iteration": 2.709188938140869 + }, + { + "auxiliary_loss_clip": 0.01140165, + "auxiliary_loss_mlp": 0.00762271, + "balance_loss_clip": 1.04590583, + "balance_loss_mlp": 1.00048661, + "epoch": 0.3815306919978356, + "flos": 19099090707840.0, + "grad_norm": 3.1572789620936867, + "language_loss": 0.78345037, + "learning_rate": 2.837973782169955e-06, + "loss": 0.80247474, + "num_input_tokens_seen": 68365450, + "step": 3173, + "time_per_iteration": 2.5054147243499756 + }, + { + "auxiliary_loss_clip": 0.01083746, + "auxiliary_loss_mlp": 0.01003965, + "balance_loss_clip": 1.01695681, + "balance_loss_mlp": 1.00276124, + "epoch": 0.38165093488847474, + "flos": 67067918156160.0, + "grad_norm": 0.8089844439337767, + "language_loss": 0.59189057, + "learning_rate": 2.8372664183838096e-06, + "loss": 0.6127677, + "num_input_tokens_seen": 68428470, + "step": 3174, + "time_per_iteration": 3.059325695037842 + }, + { + "auxiliary_loss_clip": 0.01181734, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.05307579, + "balance_loss_mlp": 1.02084374, + "epoch": 0.3817711777791138, + "flos": 22341105480960.0, + "grad_norm": 2.2264495596923406, + "language_loss": 0.68303359, + "learning_rate": 2.836558927585015e-06, + "loss": 0.70513928, + "num_input_tokens_seen": 68445440, + "step": 3175, + "time_per_iteration": 2.4270942211151123 + }, + { + "auxiliary_loss_clip": 0.01171557, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.05329132, + "balance_loss_mlp": 1.02558327, + "epoch": 0.3818914206697529, + "flos": 22820621068800.0, + "grad_norm": 1.8640719254165987, + "language_loss": 0.82636273, + "learning_rate": 2.8358513098808957e-06, + "loss": 0.84840578, + "num_input_tokens_seen": 68465755, + "step": 3176, + "time_per_iteration": 2.477747917175293 + }, + { + "auxiliary_loss_clip": 0.01117358, + "auxiliary_loss_mlp": 0.010266, + "balance_loss_clip": 1.04605341, + "balance_loss_mlp": 1.01851141, + "epoch": 0.382011663560392, + "flos": 24386074583040.0, + "grad_norm": 1.7021520832148063, + "language_loss": 0.76432312, + "learning_rate": 2.835143565378798e-06, + "loss": 0.78576267, + "num_input_tokens_seen": 68486220, + "step": 3177, + "time_per_iteration": 2.5898873805999756 + }, + { + "auxiliary_loss_clip": 0.01109025, + "auxiliary_loss_mlp": 0.01021945, + "balance_loss_clip": 1.04546857, + "balance_loss_mlp": 1.01432776, + "epoch": 0.38213190645103107, + "flos": 21981568296960.0, + "grad_norm": 2.1506874675296968, + "language_loss": 0.78206319, + "learning_rate": 2.8344356941860847e-06, + "loss": 0.80337286, + "num_input_tokens_seen": 68505850, + "step": 3178, + "time_per_iteration": 2.603647232055664 + }, + { + "auxiliary_loss_clip": 0.01137521, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.05007863, + "balance_loss_mlp": 1.02045584, + "epoch": 0.3822521493416702, + "flos": 35516945773440.0, + "grad_norm": 2.206334388246954, + "language_loss": 0.66234505, + "learning_rate": 2.8337276964101403e-06, + "loss": 0.6840027, + "num_input_tokens_seen": 68526290, + "step": 3179, + "time_per_iteration": 2.696027994155884 + }, + { + "auxiliary_loss_clip": 0.01169485, + "auxiliary_loss_mlp": 0.01027547, + "balance_loss_clip": 1.05123329, + "balance_loss_mlp": 1.02000737, + "epoch": 0.3823723922323093, + "flos": 21069904181760.0, + "grad_norm": 4.254980769317907, + "language_loss": 0.7682572, + "learning_rate": 2.833019572158367e-06, + "loss": 0.79022747, + "num_input_tokens_seen": 68544725, + "step": 3180, + "time_per_iteration": 2.488067388534546 + }, + { + "auxiliary_loss_clip": 0.0115579, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.05235767, + "balance_loss_mlp": 1.0198189, + "epoch": 0.38249263512294834, + "flos": 19789149864960.0, + "grad_norm": 1.9127208185256865, + "language_loss": 0.79979289, + "learning_rate": 2.8323113215381872e-06, + "loss": 0.82162619, + "num_input_tokens_seen": 68563070, + "step": 3181, + "time_per_iteration": 2.5239553451538086 + }, + { + "auxiliary_loss_clip": 0.01135962, + "auxiliary_loss_mlp": 0.01031835, + "balance_loss_clip": 1.04739428, + "balance_loss_mlp": 1.02378821, + "epoch": 0.38261287801358745, + "flos": 21433930565760.0, + "grad_norm": 3.3638053866364523, + "language_loss": 0.76438922, + "learning_rate": 2.831602944657042e-06, + "loss": 0.78606719, + "num_input_tokens_seen": 68581150, + "step": 3182, + "time_per_iteration": 2.53416109085083 + }, + { + "auxiliary_loss_clip": 0.01161672, + "auxiliary_loss_mlp": 0.01022724, + "balance_loss_clip": 1.0515033, + "balance_loss_mlp": 1.01535058, + "epoch": 0.38273312090422656, + "flos": 21981568296960.0, + "grad_norm": 2.4542562068155216, + "language_loss": 0.7424916, + "learning_rate": 2.830894441622391e-06, + "loss": 0.76433551, + "num_input_tokens_seen": 68597800, + "step": 3183, + "time_per_iteration": 2.496521472930908 + }, + { + "auxiliary_loss_clip": 0.01137753, + "auxiliary_loss_mlp": 0.00762416, + "balance_loss_clip": 1.04529953, + "balance_loss_mlp": 1.00051236, + "epoch": 0.3828533637948656, + "flos": 24790895838720.0, + "grad_norm": 2.063231703597195, + "language_loss": 0.80283439, + "learning_rate": 2.8301858125417134e-06, + "loss": 0.82183611, + "num_input_tokens_seen": 68617640, + "step": 3184, + "time_per_iteration": 2.565103769302368 + }, + { + "auxiliary_loss_clip": 0.01155518, + "auxiliary_loss_mlp": 0.01025726, + "balance_loss_clip": 1.05390048, + "balance_loss_mlp": 1.01862717, + "epoch": 0.38297360668550473, + "flos": 22455445449600.0, + "grad_norm": 1.7793085337085333, + "language_loss": 0.73788238, + "learning_rate": 2.8294770575225082e-06, + "loss": 0.75969481, + "num_input_tokens_seen": 68637770, + "step": 3185, + "time_per_iteration": 2.5026817321777344 + }, + { + "auxiliary_loss_clip": 0.01171711, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.05589783, + "balance_loss_mlp": 1.02291048, + "epoch": 0.3830938495761438, + "flos": 24896903852160.0, + "grad_norm": 1.8995471365824692, + "language_loss": 0.84415257, + "learning_rate": 2.828768176672293e-06, + "loss": 0.86617702, + "num_input_tokens_seen": 68656885, + "step": 3186, + "time_per_iteration": 2.499512195587158 + }, + { + "auxiliary_loss_clip": 0.01137167, + "auxiliary_loss_mlp": 0.01027959, + "balance_loss_clip": 1.0474906, + "balance_loss_mlp": 1.01999009, + "epoch": 0.3832140924667829, + "flos": 33036236784000.0, + "grad_norm": 1.7281969888694617, + "language_loss": 0.71628308, + "learning_rate": 2.8280591700986044e-06, + "loss": 0.73793435, + "num_input_tokens_seen": 68678750, + "step": 3187, + "time_per_iteration": 2.6123199462890625 + }, + { + "auxiliary_loss_clip": 0.01156838, + "auxiliary_loss_mlp": 0.01030437, + "balance_loss_clip": 1.04954779, + "balance_loss_mlp": 1.02254581, + "epoch": 0.383334335357422, + "flos": 31903721896320.0, + "grad_norm": 2.1807006866721266, + "language_loss": 0.75257981, + "learning_rate": 2.827350037908999e-06, + "loss": 0.77445263, + "num_input_tokens_seen": 68698190, + "step": 3188, + "time_per_iteration": 2.575436592102051 + }, + { + "auxiliary_loss_clip": 0.01144218, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.04833341, + "balance_loss_mlp": 1.02313757, + "epoch": 0.38345457824806106, + "flos": 19791915212160.0, + "grad_norm": 2.4878051564816346, + "language_loss": 0.7945323, + "learning_rate": 2.8266407802110496e-06, + "loss": 0.81629032, + "num_input_tokens_seen": 68716445, + "step": 3189, + "time_per_iteration": 3.3202626705169678 + }, + { + "auxiliary_loss_clip": 0.01102949, + "auxiliary_loss_mlp": 0.01027768, + "balance_loss_clip": 1.04342628, + "balance_loss_mlp": 1.01929879, + "epoch": 0.3835748211387002, + "flos": 22419391173120.0, + "grad_norm": 1.6348695623562643, + "language_loss": 0.75552332, + "learning_rate": 2.8259313971123515e-06, + "loss": 0.77683049, + "num_input_tokens_seen": 68737565, + "step": 3190, + "time_per_iteration": 2.631824493408203 + }, + { + "auxiliary_loss_clip": 0.01165465, + "auxiliary_loss_mlp": 0.01026559, + "balance_loss_clip": 1.05310309, + "balance_loss_mlp": 1.01922727, + "epoch": 0.3836950640293393, + "flos": 25118436983040.0, + "grad_norm": 1.4647485065380128, + "language_loss": 0.78014088, + "learning_rate": 2.8252218887205166e-06, + "loss": 0.80206108, + "num_input_tokens_seen": 68758255, + "step": 3191, + "time_per_iteration": 2.5090203285217285 + }, + { + "auxiliary_loss_clip": 0.01113985, + "auxiliary_loss_mlp": 0.01027931, + "balance_loss_clip": 1.04734313, + "balance_loss_mlp": 1.02055228, + "epoch": 0.38381530691997834, + "flos": 21799213925760.0, + "grad_norm": 1.7546562748941634, + "language_loss": 0.8057124, + "learning_rate": 2.824512255143178e-06, + "loss": 0.82713151, + "num_input_tokens_seen": 68777490, + "step": 3192, + "time_per_iteration": 2.5692050457000732 + }, + { + "auxiliary_loss_clip": 0.011409, + "auxiliary_loss_mlp": 0.01022626, + "balance_loss_clip": 1.04771256, + "balance_loss_mlp": 1.01530337, + "epoch": 0.38393554981061745, + "flos": 21252689516160.0, + "grad_norm": 1.5942724656274099, + "language_loss": 0.79068297, + "learning_rate": 2.8238024964879855e-06, + "loss": 0.81231821, + "num_input_tokens_seen": 68798385, + "step": 3193, + "time_per_iteration": 3.372929811477661 + }, + { + "auxiliary_loss_clip": 0.01184641, + "auxiliary_loss_mlp": 0.01028911, + "balance_loss_clip": 1.05431449, + "balance_loss_mlp": 1.02031624, + "epoch": 0.38405579270125656, + "flos": 17019360218880.0, + "grad_norm": 2.2270292550574275, + "language_loss": 0.76967895, + "learning_rate": 2.8230926128626095e-06, + "loss": 0.79181457, + "num_input_tokens_seen": 68816880, + "step": 3194, + "time_per_iteration": 2.4135050773620605 + }, + { + "auxiliary_loss_clip": 0.01150216, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.04955673, + "balance_loss_mlp": 1.02208662, + "epoch": 0.3841760355918956, + "flos": 21835375943040.0, + "grad_norm": 1.9727650672589567, + "language_loss": 0.79329073, + "learning_rate": 2.822382604374738e-06, + "loss": 0.8150999, + "num_input_tokens_seen": 68835805, + "step": 3195, + "time_per_iteration": 2.4910168647766113 + }, + { + "auxiliary_loss_clip": 0.01155221, + "auxiliary_loss_mlp": 0.01033727, + "balance_loss_clip": 1.05323637, + "balance_loss_mlp": 1.02565098, + "epoch": 0.3842962784825347, + "flos": 25915114684800.0, + "grad_norm": 2.0532166436378527, + "language_loss": 0.65856123, + "learning_rate": 2.8216724711320793e-06, + "loss": 0.68045068, + "num_input_tokens_seen": 68854930, + "step": 3196, + "time_per_iteration": 3.3022334575653076 + }, + { + "auxiliary_loss_clip": 0.01181011, + "auxiliary_loss_mlp": 0.00762096, + "balance_loss_clip": 1.05351889, + "balance_loss_mlp": 1.0004859, + "epoch": 0.38441652137317384, + "flos": 25337492075520.0, + "grad_norm": 1.47877781050593, + "language_loss": 0.79610753, + "learning_rate": 2.820962213242361e-06, + "loss": 0.81553864, + "num_input_tokens_seen": 68874260, + "step": 3197, + "time_per_iteration": 2.4673094749450684 + }, + { + "auxiliary_loss_clip": 0.01165806, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.05499542, + "balance_loss_mlp": 1.02281749, + "epoch": 0.3845367642638129, + "flos": 18113486446080.0, + "grad_norm": 2.0839808971588023, + "language_loss": 0.84196889, + "learning_rate": 2.8202518308133264e-06, + "loss": 0.86393297, + "num_input_tokens_seen": 68891535, + "step": 3198, + "time_per_iteration": 2.4500091075897217 + }, + { + "auxiliary_loss_clip": 0.01183405, + "auxiliary_loss_mlp": 0.010283, + "balance_loss_clip": 1.05307496, + "balance_loss_mlp": 1.01988339, + "epoch": 0.384657007154452, + "flos": 25228395492480.0, + "grad_norm": 1.7723069316482476, + "language_loss": 0.73268163, + "learning_rate": 2.8195413239527426e-06, + "loss": 0.75479865, + "num_input_tokens_seen": 68911275, + "step": 3199, + "time_per_iteration": 2.4608614444732666 + }, + { + "auxiliary_loss_clip": 0.01165938, + "auxiliary_loss_mlp": 0.01025012, + "balance_loss_clip": 1.05086684, + "balance_loss_mlp": 1.01728714, + "epoch": 0.38477725004509106, + "flos": 19865855358720.0, + "grad_norm": 2.984715420168236, + "language_loss": 0.80649495, + "learning_rate": 2.8188306927683906e-06, + "loss": 0.82840443, + "num_input_tokens_seen": 68930745, + "step": 3200, + "time_per_iteration": 2.479733467102051 + }, + { + "auxiliary_loss_clip": 0.01157421, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.05219245, + "balance_loss_mlp": 1.01882255, + "epoch": 0.38489749293573017, + "flos": 18259391491200.0, + "grad_norm": 2.0538072666640517, + "language_loss": 0.74708247, + "learning_rate": 2.818119937368074e-06, + "loss": 0.76892054, + "num_input_tokens_seen": 68949380, + "step": 3201, + "time_per_iteration": 2.48122501373291 + }, + { + "auxiliary_loss_clip": 0.01174571, + "auxiliary_loss_mlp": 0.01026306, + "balance_loss_clip": 1.05240822, + "balance_loss_mlp": 1.01789594, + "epoch": 0.3850177358263693, + "flos": 24389163152640.0, + "grad_norm": 1.7977164773743752, + "language_loss": 0.6552316, + "learning_rate": 2.817409057859613e-06, + "loss": 0.67724037, + "num_input_tokens_seen": 68968370, + "step": 3202, + "time_per_iteration": 2.495089530944824 + }, + { + "auxiliary_loss_clip": 0.01119938, + "auxiliary_loss_mlp": 0.01029223, + "balance_loss_clip": 1.04628098, + "balance_loss_mlp": 1.02043748, + "epoch": 0.38513797871700833, + "flos": 17671533505920.0, + "grad_norm": 1.8553361074795838, + "language_loss": 0.79098183, + "learning_rate": 2.8166980543508482e-06, + "loss": 0.81247342, + "num_input_tokens_seen": 68984260, + "step": 3203, + "time_per_iteration": 2.5306320190429688 + }, + { + "auxiliary_loss_clip": 0.01184912, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.05515623, + "balance_loss_mlp": 1.0195688, + "epoch": 0.38525822160764744, + "flos": 25739583897600.0, + "grad_norm": 2.2440143030083197, + "language_loss": 0.79745102, + "learning_rate": 2.815986926949638e-06, + "loss": 0.81957686, + "num_input_tokens_seen": 69002760, + "step": 3204, + "time_per_iteration": 2.4764270782470703 + }, + { + "auxiliary_loss_clip": 0.01169411, + "auxiliary_loss_mlp": 0.0102993, + "balance_loss_clip": 1.05345321, + "balance_loss_mlp": 1.02224088, + "epoch": 0.38537846449828655, + "flos": 20193647898240.0, + "grad_norm": 1.9366310159940756, + "language_loss": 0.80485642, + "learning_rate": 2.8152756757638597e-06, + "loss": 0.82684982, + "num_input_tokens_seen": 69021260, + "step": 3205, + "time_per_iteration": 2.525256633758545 + }, + { + "auxiliary_loss_clip": 0.01168349, + "auxiliary_loss_mlp": 0.01025131, + "balance_loss_clip": 1.05362892, + "balance_loss_mlp": 1.01730502, + "epoch": 0.3854987073889256, + "flos": 23039352938880.0, + "grad_norm": 1.957318819849449, + "language_loss": 0.84805369, + "learning_rate": 2.8145643009014093e-06, + "loss": 0.86998856, + "num_input_tokens_seen": 69039755, + "step": 3206, + "time_per_iteration": 2.469200611114502 + }, + { + "auxiliary_loss_clip": 0.01169757, + "auxiliary_loss_mlp": 0.01027836, + "balance_loss_clip": 1.05223334, + "balance_loss_mlp": 1.02062714, + "epoch": 0.3856189502795647, + "flos": 20190631155840.0, + "grad_norm": 1.8727171650986616, + "language_loss": 0.79482269, + "learning_rate": 2.813852802470202e-06, + "loss": 0.81679857, + "num_input_tokens_seen": 69057650, + "step": 3207, + "time_per_iteration": 2.4493305683135986 + }, + { + "auxiliary_loss_clip": 0.01149093, + "auxiliary_loss_mlp": 0.01028307, + "balance_loss_clip": 1.04925859, + "balance_loss_mlp": 1.01988482, + "epoch": 0.38573919317020383, + "flos": 25702631781120.0, + "grad_norm": 1.8564654416961834, + "language_loss": 0.72477341, + "learning_rate": 2.8131411805781717e-06, + "loss": 0.74654734, + "num_input_tokens_seen": 69077775, + "step": 3208, + "time_per_iteration": 2.52998685836792 + }, + { + "auxiliary_loss_clip": 0.0115728, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.05320108, + "balance_loss_mlp": 1.02411771, + "epoch": 0.3858594360608429, + "flos": 29821405628160.0, + "grad_norm": 2.4154988743530725, + "language_loss": 0.64251673, + "learning_rate": 2.8124294353332707e-06, + "loss": 0.66441751, + "num_input_tokens_seen": 69096450, + "step": 3209, + "time_per_iteration": 2.5405120849609375 + }, + { + "auxiliary_loss_clip": 0.01147448, + "auxiliary_loss_mlp": 0.01029025, + "balance_loss_clip": 1.05024803, + "balance_loss_mlp": 1.02114272, + "epoch": 0.385979678951482, + "flos": 24790428961920.0, + "grad_norm": 2.0734802543577624, + "language_loss": 0.77002323, + "learning_rate": 2.8117175668434713e-06, + "loss": 0.79178798, + "num_input_tokens_seen": 69116110, + "step": 3210, + "time_per_iteration": 2.569413185119629 + }, + { + "auxiliary_loss_clip": 0.0118342, + "auxiliary_loss_mlp": 0.01025331, + "balance_loss_clip": 1.05286443, + "balance_loss_mlp": 1.01746917, + "epoch": 0.3860999218421211, + "flos": 21287881866240.0, + "grad_norm": 2.156595512260456, + "language_loss": 0.70295525, + "learning_rate": 2.811005575216762e-06, + "loss": 0.72504276, + "num_input_tokens_seen": 69134825, + "step": 3211, + "time_per_iteration": 2.426785469055176 + }, + { + "auxiliary_loss_clip": 0.01136072, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.04940605, + "balance_loss_mlp": 1.02164114, + "epoch": 0.38622016473276016, + "flos": 24536720223360.0, + "grad_norm": 1.4879811042686577, + "language_loss": 0.78962409, + "learning_rate": 2.8102934605611513e-06, + "loss": 0.81127954, + "num_input_tokens_seen": 69156460, + "step": 3212, + "time_per_iteration": 2.630581855773926 + }, + { + "auxiliary_loss_clip": 0.01162519, + "auxiliary_loss_mlp": 0.01027802, + "balance_loss_clip": 1.05406499, + "balance_loss_mlp": 1.02010679, + "epoch": 0.3863404076233993, + "flos": 20558212986240.0, + "grad_norm": 1.9334712343041998, + "language_loss": 0.67580163, + "learning_rate": 2.8095812229846665e-06, + "loss": 0.69770491, + "num_input_tokens_seen": 69176420, + "step": 3213, + "time_per_iteration": 2.4756698608398438 + }, + { + "auxiliary_loss_clip": 0.01155649, + "auxiliary_loss_mlp": 0.01027104, + "balance_loss_clip": 1.05022776, + "balance_loss_mlp": 1.01885474, + "epoch": 0.3864606505140384, + "flos": 22346277039360.0, + "grad_norm": 2.2897727708591593, + "language_loss": 0.69092166, + "learning_rate": 2.808868862595355e-06, + "loss": 0.71274924, + "num_input_tokens_seen": 69196665, + "step": 3214, + "time_per_iteration": 2.5003111362457275 + }, + { + "auxiliary_loss_clip": 0.01173008, + "auxiliary_loss_mlp": 0.01029896, + "balance_loss_clip": 1.05276656, + "balance_loss_mlp": 1.02217674, + "epoch": 0.38658089340467744, + "flos": 25703601448320.0, + "grad_norm": 1.794278604247436, + "language_loss": 0.79630697, + "learning_rate": 2.8081563795012795e-06, + "loss": 0.81833601, + "num_input_tokens_seen": 69216290, + "step": 3215, + "time_per_iteration": 2.505335569381714 + }, + { + "auxiliary_loss_clip": 0.01162649, + "auxiliary_loss_mlp": 0.01025337, + "balance_loss_clip": 1.0503298, + "balance_loss_mlp": 1.01706409, + "epoch": 0.38670113629531655, + "flos": 33802534558080.0, + "grad_norm": 1.7089105332158925, + "language_loss": 0.73840106, + "learning_rate": 2.807443773810524e-06, + "loss": 0.76028097, + "num_input_tokens_seen": 69237550, + "step": 3216, + "time_per_iteration": 3.442344903945923 + }, + { + "auxiliary_loss_clip": 0.01140815, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.0520426, + "balance_loss_mlp": 1.02167737, + "epoch": 0.3868213791859556, + "flos": 23331522165120.0, + "grad_norm": 1.7494564552128187, + "language_loss": 0.89491987, + "learning_rate": 2.80673104563119e-06, + "loss": 0.91662037, + "num_input_tokens_seen": 69258175, + "step": 3217, + "time_per_iteration": 2.5437746047973633 + }, + { + "auxiliary_loss_clip": 0.01167748, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.05380428, + "balance_loss_mlp": 1.01898932, + "epoch": 0.3869416220765947, + "flos": 18441530380800.0, + "grad_norm": 1.8935414531063413, + "language_loss": 0.78779399, + "learning_rate": 2.8060181950713976e-06, + "loss": 0.80973607, + "num_input_tokens_seen": 69274965, + "step": 3218, + "time_per_iteration": 2.445941925048828 + }, + { + "auxiliary_loss_clip": 0.01140586, + "auxiliary_loss_mlp": 0.01030357, + "balance_loss_clip": 1.04957235, + "balance_loss_mlp": 1.02179217, + "epoch": 0.3870618649672338, + "flos": 15632992938240.0, + "grad_norm": 2.0349587359910974, + "language_loss": 0.80779105, + "learning_rate": 2.805305222239286e-06, + "loss": 0.82950044, + "num_input_tokens_seen": 69292220, + "step": 3219, + "time_per_iteration": 2.5065672397613525 + }, + { + "auxiliary_loss_clip": 0.01152443, + "auxiliary_loss_mlp": 0.01032107, + "balance_loss_clip": 1.05142903, + "balance_loss_mlp": 1.02383995, + "epoch": 0.3871821078578729, + "flos": 23513804709120.0, + "grad_norm": 1.8508080525203485, + "language_loss": 0.74011767, + "learning_rate": 2.8045921272430118e-06, + "loss": 0.76196313, + "num_input_tokens_seen": 69311900, + "step": 3220, + "time_per_iteration": 3.388962984085083 + }, + { + "auxiliary_loss_clip": 0.01177741, + "auxiliary_loss_mlp": 0.01029053, + "balance_loss_clip": 1.05307615, + "balance_loss_mlp": 1.02063656, + "epoch": 0.387302350748512, + "flos": 17778259791360.0, + "grad_norm": 2.8917845533421893, + "language_loss": 0.7632134, + "learning_rate": 2.803878910190753e-06, + "loss": 0.7852813, + "num_input_tokens_seen": 69328820, + "step": 3221, + "time_per_iteration": 2.4334919452667236 + }, + { + "auxiliary_loss_clip": 0.01173599, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.0524317, + "balance_loss_mlp": 1.02012825, + "epoch": 0.3874225936391511, + "flos": 11503409097600.0, + "grad_norm": 2.2293242821823527, + "language_loss": 0.82540745, + "learning_rate": 2.8031655711907017e-06, + "loss": 0.84742689, + "num_input_tokens_seen": 69342525, + "step": 3222, + "time_per_iteration": 3.234394073486328 + }, + { + "auxiliary_loss_clip": 0.01176365, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.05663884, + "balance_loss_mlp": 1.02091599, + "epoch": 0.38754283652979016, + "flos": 21945154884480.0, + "grad_norm": 2.0200332788840996, + "language_loss": 0.80665362, + "learning_rate": 2.8024521103510723e-06, + "loss": 0.82870495, + "num_input_tokens_seen": 69359295, + "step": 3223, + "time_per_iteration": 3.2388243675231934 + }, + { + "auxiliary_loss_clip": 0.01168977, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.05062079, + "balance_loss_mlp": 1.01933742, + "epoch": 0.38766307942042927, + "flos": 21175984022400.0, + "grad_norm": 1.6623459373167608, + "language_loss": 0.74894947, + "learning_rate": 2.8017385277800952e-06, + "loss": 0.77090788, + "num_input_tokens_seen": 69377650, + "step": 3224, + "time_per_iteration": 2.4779410362243652 + }, + { + "auxiliary_loss_clip": 0.01147191, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.05246294, + "balance_loss_mlp": 1.02270842, + "epoch": 0.3877833223110684, + "flos": 27417294391680.0, + "grad_norm": 2.1022691550027806, + "language_loss": 0.74999893, + "learning_rate": 2.8010248235860213e-06, + "loss": 0.77177918, + "num_input_tokens_seen": 69397765, + "step": 3225, + "time_per_iteration": 2.5907249450683594 + }, + { + "auxiliary_loss_clip": 0.01064901, + "auxiliary_loss_mlp": 0.00753871, + "balance_loss_clip": 1.01622009, + "balance_loss_mlp": 1.00104749, + "epoch": 0.38790356520170743, + "flos": 64500019879680.0, + "grad_norm": 0.8316356263695989, + "language_loss": 0.62737465, + "learning_rate": 2.8003109978771192e-06, + "loss": 0.64556241, + "num_input_tokens_seen": 69458930, + "step": 3226, + "time_per_iteration": 3.1397085189819336 + }, + { + "auxiliary_loss_clip": 0.01132178, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.04446375, + "balance_loss_mlp": 1.02046585, + "epoch": 0.38802380809234654, + "flos": 22345415112960.0, + "grad_norm": 1.9025751700006426, + "language_loss": 0.78832626, + "learning_rate": 2.799597050761674e-06, + "loss": 0.80993408, + "num_input_tokens_seen": 69475135, + "step": 3227, + "time_per_iteration": 2.5487070083618164 + }, + { + "auxiliary_loss_clip": 0.01185842, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.0546875, + "balance_loss_mlp": 1.02154112, + "epoch": 0.38814405098298566, + "flos": 25261361199360.0, + "grad_norm": 2.2978969370769353, + "language_loss": 0.78845787, + "learning_rate": 2.7988829823479924e-06, + "loss": 0.81061447, + "num_input_tokens_seen": 69493525, + "step": 3228, + "time_per_iteration": 2.4963016510009766 + }, + { + "auxiliary_loss_clip": 0.01151832, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.04949355, + "balance_loss_mlp": 1.02595854, + "epoch": 0.3882642938736247, + "flos": 18841180078080.0, + "grad_norm": 1.8006423740272943, + "language_loss": 0.64153314, + "learning_rate": 2.7981687927443976e-06, + "loss": 0.66340065, + "num_input_tokens_seen": 69510325, + "step": 3229, + "time_per_iteration": 2.518388509750366 + }, + { + "auxiliary_loss_clip": 0.01170466, + "auxiliary_loss_mlp": 0.01024373, + "balance_loss_clip": 1.05142999, + "balance_loss_mlp": 1.01699448, + "epoch": 0.3883845367642638, + "flos": 21652806090240.0, + "grad_norm": 2.1126366535360583, + "language_loss": 0.85848439, + "learning_rate": 2.797454482059231e-06, + "loss": 0.88043278, + "num_input_tokens_seen": 69530480, + "step": 3230, + "time_per_iteration": 2.4896163940429688 + }, + { + "auxiliary_loss_clip": 0.01188312, + "auxiliary_loss_mlp": 0.01022747, + "balance_loss_clip": 1.05571604, + "balance_loss_mlp": 1.0150522, + "epoch": 0.3885047796549029, + "flos": 20557530627840.0, + "grad_norm": 1.6989963090614713, + "language_loss": 0.84420633, + "learning_rate": 2.7967400504008537e-06, + "loss": 0.86631697, + "num_input_tokens_seen": 69549780, + "step": 3231, + "time_per_iteration": 2.447850227355957 + }, + { + "auxiliary_loss_clip": 0.01037035, + "auxiliary_loss_mlp": 0.01001894, + "balance_loss_clip": 1.01485324, + "balance_loss_mlp": 1.00045156, + "epoch": 0.388625022545542, + "flos": 64325491695360.0, + "grad_norm": 0.8022526018366798, + "language_loss": 0.57465744, + "learning_rate": 2.7960254978776456e-06, + "loss": 0.59504676, + "num_input_tokens_seen": 69611870, + "step": 3232, + "time_per_iteration": 3.1105988025665283 + }, + { + "auxiliary_loss_clip": 0.01192168, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.05890322, + "balance_loss_mlp": 1.02764356, + "epoch": 0.3887452654361811, + "flos": 18113881495680.0, + "grad_norm": 2.205559368003149, + "language_loss": 0.81482327, + "learning_rate": 2.7953108245980006e-06, + "loss": 0.83710694, + "num_input_tokens_seen": 69630385, + "step": 3233, + "time_per_iteration": 2.427133321762085 + }, + { + "auxiliary_loss_clip": 0.01150324, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.05208707, + "balance_loss_mlp": 1.02127862, + "epoch": 0.38886550832682015, + "flos": 24975261371520.0, + "grad_norm": 1.732846160751345, + "language_loss": 0.73512185, + "learning_rate": 2.7945960306703365e-06, + "loss": 0.75691414, + "num_input_tokens_seen": 69653370, + "step": 3234, + "time_per_iteration": 2.526604652404785 + }, + { + "auxiliary_loss_clip": 0.0117497, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.05309224, + "balance_loss_mlp": 1.02005076, + "epoch": 0.38898575121745926, + "flos": 27199496275200.0, + "grad_norm": 1.588899129368604, + "language_loss": 0.65811324, + "learning_rate": 2.7938811162030865e-06, + "loss": 0.6801464, + "num_input_tokens_seen": 69673635, + "step": 3235, + "time_per_iteration": 2.509488105773926 + }, + { + "auxiliary_loss_clip": 0.01170715, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.05484247, + "balance_loss_mlp": 1.02483487, + "epoch": 0.3891059941080984, + "flos": 28763728727040.0, + "grad_norm": 1.7277976198209395, + "language_loss": 0.82470143, + "learning_rate": 2.793166081304702e-06, + "loss": 0.84673244, + "num_input_tokens_seen": 69694130, + "step": 3236, + "time_per_iteration": 2.5227341651916504 + }, + { + "auxiliary_loss_clip": 0.01147911, + "auxiliary_loss_mlp": 0.01031027, + "balance_loss_clip": 1.04874611, + "balance_loss_mlp": 1.02245021, + "epoch": 0.38922623699873743, + "flos": 22893447893760.0, + "grad_norm": 2.0276868238012424, + "language_loss": 0.8219837, + "learning_rate": 2.7924509260836543e-06, + "loss": 0.84377313, + "num_input_tokens_seen": 69713255, + "step": 3237, + "time_per_iteration": 2.543348550796509 + }, + { + "auxiliary_loss_clip": 0.0114166, + "auxiliary_loss_mlp": 0.01025844, + "balance_loss_clip": 1.04836845, + "balance_loss_mlp": 1.01775575, + "epoch": 0.38934647988937654, + "flos": 19792418002560.0, + "grad_norm": 2.050684782806791, + "language_loss": 0.68545651, + "learning_rate": 2.791735650648431e-06, + "loss": 0.70713162, + "num_input_tokens_seen": 69732375, + "step": 3238, + "time_per_iteration": 2.5303473472595215 + }, + { + "auxiliary_loss_clip": 0.01155465, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.05193543, + "balance_loss_mlp": 1.01992679, + "epoch": 0.38946672278001565, + "flos": 19202081978880.0, + "grad_norm": 1.950539965555811, + "language_loss": 0.74656129, + "learning_rate": 2.791020255107538e-06, + "loss": 0.76839209, + "num_input_tokens_seen": 69749745, + "step": 3239, + "time_per_iteration": 2.480377674102783 + }, + { + "auxiliary_loss_clip": 0.01137207, + "auxiliary_loss_mlp": 0.01023849, + "balance_loss_clip": 1.04646337, + "balance_loss_mlp": 1.01620817, + "epoch": 0.3895869656706547, + "flos": 24936477661440.0, + "grad_norm": 1.5719149804481403, + "language_loss": 0.80506647, + "learning_rate": 2.7903047395695023e-06, + "loss": 0.82667708, + "num_input_tokens_seen": 69769645, + "step": 3240, + "time_per_iteration": 2.578160285949707 + }, + { + "auxiliary_loss_clip": 0.01171727, + "auxiliary_loss_mlp": 0.00763544, + "balance_loss_clip": 1.05551887, + "balance_loss_mlp": 1.00050211, + "epoch": 0.3897072085612938, + "flos": 24133622820480.0, + "grad_norm": 2.2504149998124725, + "language_loss": 0.90364242, + "learning_rate": 2.789589104142865e-06, + "loss": 0.92299509, + "num_input_tokens_seen": 69787270, + "step": 3241, + "time_per_iteration": 2.501742362976074 + }, + { + "auxiliary_loss_clip": 0.01147688, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.05282593, + "balance_loss_mlp": 1.02234602, + "epoch": 0.3898274514519329, + "flos": 17166342672000.0, + "grad_norm": 1.636412462729543, + "language_loss": 0.76452243, + "learning_rate": 2.7888733489361895e-06, + "loss": 0.78630471, + "num_input_tokens_seen": 69805685, + "step": 3242, + "time_per_iteration": 3.352848768234253 + }, + { + "auxiliary_loss_clip": 0.01082728, + "auxiliary_loss_mlp": 0.0100187, + "balance_loss_clip": 1.01639056, + "balance_loss_mlp": 1.0006541, + "epoch": 0.389947694342572, + "flos": 66074807952000.0, + "grad_norm": 0.7385728826113218, + "language_loss": 0.58735919, + "learning_rate": 2.788157474058054e-06, + "loss": 0.6082052, + "num_input_tokens_seen": 69867960, + "step": 3243, + "time_per_iteration": 3.1290664672851562 + }, + { + "auxiliary_loss_clip": 0.01183456, + "auxiliary_loss_mlp": 0.01025089, + "balance_loss_clip": 1.05488944, + "balance_loss_mlp": 1.01723337, + "epoch": 0.3900679372332111, + "flos": 25740912700800.0, + "grad_norm": 2.033501168117817, + "language_loss": 0.69751221, + "learning_rate": 2.7874414796170555e-06, + "loss": 0.7195977, + "num_input_tokens_seen": 69889450, + "step": 3244, + "time_per_iteration": 2.499852180480957 + }, + { + "auxiliary_loss_clip": 0.0116801, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.0513339, + "balance_loss_mlp": 1.02149379, + "epoch": 0.3901881801238502, + "flos": 11801611808640.0, + "grad_norm": 2.7595706859646376, + "language_loss": 0.83891404, + "learning_rate": 2.7867253657218113e-06, + "loss": 0.86089897, + "num_input_tokens_seen": 69903340, + "step": 3245, + "time_per_iteration": 2.443608045578003 + }, + { + "auxiliary_loss_clip": 0.01153751, + "auxiliary_loss_mlp": 0.0076292, + "balance_loss_clip": 1.04865181, + "balance_loss_mlp": 1.00037992, + "epoch": 0.39030842301448926, + "flos": 27308951994240.0, + "grad_norm": 1.6194609997213272, + "language_loss": 0.73074603, + "learning_rate": 2.7860091324809544e-06, + "loss": 0.74991274, + "num_input_tokens_seen": 69924400, + "step": 3246, + "time_per_iteration": 3.4300479888916016 + }, + { + "auxiliary_loss_clip": 0.01169921, + "auxiliary_loss_mlp": 0.01024381, + "balance_loss_clip": 1.05628026, + "balance_loss_mlp": 1.01642418, + "epoch": 0.39042866590512837, + "flos": 27163334257920.0, + "grad_norm": 1.6558130652737997, + "language_loss": 0.81046462, + "learning_rate": 2.7852927800031377e-06, + "loss": 0.83240759, + "num_input_tokens_seen": 69944565, + "step": 3247, + "time_per_iteration": 2.5320394039154053 + }, + { + "auxiliary_loss_clip": 0.01160907, + "auxiliary_loss_mlp": 0.01026287, + "balance_loss_clip": 1.05256021, + "balance_loss_mlp": 1.0188725, + "epoch": 0.3905489087957674, + "flos": 29716115886720.0, + "grad_norm": 1.6210112064440096, + "language_loss": 0.82835734, + "learning_rate": 2.7845763083970298e-06, + "loss": 0.85022926, + "num_input_tokens_seen": 69964965, + "step": 3248, + "time_per_iteration": 2.5642549991607666 + }, + { + "auxiliary_loss_clip": 0.01162749, + "auxiliary_loss_mlp": 0.01026318, + "balance_loss_clip": 1.05101204, + "balance_loss_mlp": 1.01803303, + "epoch": 0.39066915168640653, + "flos": 24498618871680.0, + "grad_norm": 1.7764446345116056, + "language_loss": 0.81763041, + "learning_rate": 2.7838597177713205e-06, + "loss": 0.83952105, + "num_input_tokens_seen": 69986055, + "step": 3249, + "time_per_iteration": 3.9981529712677 + }, + { + "auxiliary_loss_clip": 0.01103956, + "auxiliary_loss_mlp": 0.01033713, + "balance_loss_clip": 1.04911184, + "balance_loss_mlp": 1.02456367, + "epoch": 0.39078939457704565, + "flos": 20558572122240.0, + "grad_norm": 1.6756429597870666, + "language_loss": 0.73753262, + "learning_rate": 2.7831430082347143e-06, + "loss": 0.75890934, + "num_input_tokens_seen": 70005260, + "step": 3250, + "time_per_iteration": 2.58373761177063 + }, + { + "auxiliary_loss_clip": 0.0117313, + "auxiliary_loss_mlp": 0.0076195, + "balance_loss_clip": 1.05479407, + "balance_loss_mlp": 1.0004319, + "epoch": 0.3909096374676847, + "flos": 22783417557120.0, + "grad_norm": 2.2673661345969656, + "language_loss": 0.82241976, + "learning_rate": 2.7824261798959373e-06, + "loss": 0.84177053, + "num_input_tokens_seen": 70023440, + "step": 3251, + "time_per_iteration": 2.4760334491729736 + }, + { + "auxiliary_loss_clip": 0.01159385, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.04956758, + "balance_loss_mlp": 1.02312326, + "epoch": 0.3910298803583238, + "flos": 23003119094400.0, + "grad_norm": 3.302233109838475, + "language_loss": 0.79711246, + "learning_rate": 2.78170923286373e-06, + "loss": 0.81901896, + "num_input_tokens_seen": 70043040, + "step": 3252, + "time_per_iteration": 2.503570795059204 + }, + { + "auxiliary_loss_clip": 0.01095594, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.04935503, + "balance_loss_mlp": 1.02100134, + "epoch": 0.3911501232489629, + "flos": 24316264500480.0, + "grad_norm": 2.524476181282204, + "language_loss": 0.84054238, + "learning_rate": 2.780992167246854e-06, + "loss": 0.86179155, + "num_input_tokens_seen": 70060565, + "step": 3253, + "time_per_iteration": 2.6338977813720703 + }, + { + "auxiliary_loss_clip": 0.01063074, + "auxiliary_loss_mlp": 0.01000957, + "balance_loss_clip": 1.01545167, + "balance_loss_mlp": 0.99963409, + "epoch": 0.391270366139602, + "flos": 60869054684160.0, + "grad_norm": 0.9770640126092774, + "language_loss": 0.7217567, + "learning_rate": 2.7802749831540883e-06, + "loss": 0.74239695, + "num_input_tokens_seen": 70119465, + "step": 3254, + "time_per_iteration": 3.1400845050811768 + }, + { + "auxiliary_loss_clip": 0.01135714, + "auxiliary_loss_mlp": 0.01024722, + "balance_loss_clip": 1.05231929, + "balance_loss_mlp": 1.01810563, + "epoch": 0.3913906090302411, + "flos": 21543494025600.0, + "grad_norm": 1.8329094666211643, + "language_loss": 0.81702727, + "learning_rate": 2.7795576806942268e-06, + "loss": 0.83863163, + "num_input_tokens_seen": 70138270, + "step": 3255, + "time_per_iteration": 2.5657639503479004 + }, + { + "auxiliary_loss_clip": 0.01065919, + "auxiliary_loss_mlp": 0.01015118, + "balance_loss_clip": 1.0305686, + "balance_loss_mlp": 1.01323438, + "epoch": 0.3915108519208802, + "flos": 49839953702400.0, + "grad_norm": 0.7636547175130955, + "language_loss": 0.54911411, + "learning_rate": 2.778840259976085e-06, + "loss": 0.56992459, + "num_input_tokens_seen": 70193500, + "step": 3256, + "time_per_iteration": 3.0667998790740967 + }, + { + "auxiliary_loss_clip": 0.01172861, + "auxiliary_loss_mlp": 0.01029573, + "balance_loss_clip": 1.05391896, + "balance_loss_mlp": 1.02144861, + "epoch": 0.39163109481151925, + "flos": 16506447960960.0, + "grad_norm": 2.222199025902046, + "language_loss": 0.76880467, + "learning_rate": 2.778122721108495e-06, + "loss": 0.79082906, + "num_input_tokens_seen": 70211730, + "step": 3257, + "time_per_iteration": 2.4855129718780518 + }, + { + "auxiliary_loss_clip": 0.01169594, + "auxiliary_loss_mlp": 0.01025829, + "balance_loss_clip": 1.05525732, + "balance_loss_mlp": 1.01809812, + "epoch": 0.39175133770215836, + "flos": 26067484177920.0, + "grad_norm": 1.864806882877144, + "language_loss": 0.88906801, + "learning_rate": 2.7774050642003076e-06, + "loss": 0.91102219, + "num_input_tokens_seen": 70232540, + "step": 3258, + "time_per_iteration": 2.519649028778076 + }, + { + "auxiliary_loss_clip": 0.01189462, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.05702865, + "balance_loss_mlp": 1.02312708, + "epoch": 0.3918715805927975, + "flos": 21872076664320.0, + "grad_norm": 2.824304961261037, + "language_loss": 0.93794262, + "learning_rate": 2.7766872893603896e-06, + "loss": 0.96015573, + "num_input_tokens_seen": 70252515, + "step": 3259, + "time_per_iteration": 2.465919017791748 + }, + { + "auxiliary_loss_clip": 0.01170958, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.05340433, + "balance_loss_mlp": 1.02673042, + "epoch": 0.39199182348343653, + "flos": 20376181837440.0, + "grad_norm": 1.8652807224471422, + "language_loss": 0.72857916, + "learning_rate": 2.7759693966976275e-06, + "loss": 0.7506308, + "num_input_tokens_seen": 70271020, + "step": 3260, + "time_per_iteration": 2.4618334770202637 + }, + { + "auxiliary_loss_clip": 0.01139869, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.04971039, + "balance_loss_mlp": 1.02138758, + "epoch": 0.39211206637407564, + "flos": 21683545153920.0, + "grad_norm": 1.8205055729924071, + "language_loss": 0.85115933, + "learning_rate": 2.7752513863209242e-06, + "loss": 0.87285715, + "num_input_tokens_seen": 70289600, + "step": 3261, + "time_per_iteration": 2.5293543338775635 + }, + { + "auxiliary_loss_clip": 0.01151067, + "auxiliary_loss_mlp": 0.00762549, + "balance_loss_clip": 1.05329037, + "balance_loss_mlp": 1.00041938, + "epoch": 0.39223230926471475, + "flos": 21066276908160.0, + "grad_norm": 1.8292048440569295, + "language_loss": 0.84580135, + "learning_rate": 2.774533258339203e-06, + "loss": 0.86493748, + "num_input_tokens_seen": 70307060, + "step": 3262, + "time_per_iteration": 2.505244493484497 + }, + { + "auxiliary_loss_clip": 0.01127517, + "auxiliary_loss_mlp": 0.01032921, + "balance_loss_clip": 1.04422212, + "balance_loss_mlp": 1.02389729, + "epoch": 0.3923525521553538, + "flos": 17603016312960.0, + "grad_norm": 2.1113699509031942, + "language_loss": 0.7951684, + "learning_rate": 2.7738150128614014e-06, + "loss": 0.81677282, + "num_input_tokens_seen": 70324465, + "step": 3263, + "time_per_iteration": 2.55241322517395 + }, + { + "auxiliary_loss_clip": 0.01136683, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.05254436, + "balance_loss_mlp": 1.02444577, + "epoch": 0.3924727950459929, + "flos": 20558284813440.0, + "grad_norm": 1.6557518437488754, + "language_loss": 0.89292258, + "learning_rate": 2.7730966499964777e-06, + "loss": 0.91461545, + "num_input_tokens_seen": 70341415, + "step": 3264, + "time_per_iteration": 2.530471086502075 + }, + { + "auxiliary_loss_clip": 0.01189101, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.05687809, + "balance_loss_mlp": 1.02475333, + "epoch": 0.39259303793663197, + "flos": 16216110328320.0, + "grad_norm": 2.5280735980282953, + "language_loss": 0.80623281, + "learning_rate": 2.772378169853408e-06, + "loss": 0.82845283, + "num_input_tokens_seen": 70358985, + "step": 3265, + "time_per_iteration": 2.406672716140747 + }, + { + "auxiliary_loss_clip": 0.01143242, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.05250764, + "balance_loss_mlp": 1.02130437, + "epoch": 0.3927132808272711, + "flos": 16797001075200.0, + "grad_norm": 1.6516614693086602, + "language_loss": 0.74094671, + "learning_rate": 2.771659572541183e-06, + "loss": 0.76267505, + "num_input_tokens_seen": 70376915, + "step": 3266, + "time_per_iteration": 2.5126864910125732 + }, + { + "auxiliary_loss_clip": 0.01176372, + "auxiliary_loss_mlp": 0.01026713, + "balance_loss_clip": 1.05828738, + "balance_loss_mlp": 1.01848209, + "epoch": 0.3928335237179102, + "flos": 20267228908800.0, + "grad_norm": 3.2506458650837264, + "language_loss": 0.86854154, + "learning_rate": 2.7709408581688143e-06, + "loss": 0.89057237, + "num_input_tokens_seen": 70396900, + "step": 3267, + "time_per_iteration": 2.467857599258423 + }, + { + "auxiliary_loss_clip": 0.01153162, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.05427825, + "balance_loss_mlp": 1.02444792, + "epoch": 0.39295376660854925, + "flos": 24973250209920.0, + "grad_norm": 2.6326070050291, + "language_loss": 0.8742345, + "learning_rate": 2.7702220268453307e-06, + "loss": 0.89608765, + "num_input_tokens_seen": 70417260, + "step": 3268, + "time_per_iteration": 2.5738706588745117 + }, + { + "auxiliary_loss_clip": 0.01158392, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.05179596, + "balance_loss_mlp": 1.02229166, + "epoch": 0.39307400949918836, + "flos": 18697788984960.0, + "grad_norm": 1.9635488571225173, + "language_loss": 0.85025442, + "learning_rate": 2.7695030786797785e-06, + "loss": 0.87214512, + "num_input_tokens_seen": 70433155, + "step": 3269, + "time_per_iteration": 3.3012287616729736 + }, + { + "auxiliary_loss_clip": 0.01123626, + "auxiliary_loss_mlp": 0.0102671, + "balance_loss_clip": 1.04863238, + "balance_loss_mlp": 1.01891422, + "epoch": 0.39319425238982747, + "flos": 22415476590720.0, + "grad_norm": 2.439163504877315, + "language_loss": 0.74610567, + "learning_rate": 2.7687840137812206e-06, + "loss": 0.76760912, + "num_input_tokens_seen": 70451240, + "step": 3270, + "time_per_iteration": 2.560053825378418 + }, + { + "auxiliary_loss_clip": 0.01066677, + "auxiliary_loss_mlp": 0.01009795, + "balance_loss_clip": 1.01438951, + "balance_loss_mlp": 1.00873423, + "epoch": 0.3933144952804665, + "flos": 66192954762240.0, + "grad_norm": 0.8055337839960424, + "language_loss": 0.62112021, + "learning_rate": 2.7680648322587395e-06, + "loss": 0.64188492, + "num_input_tokens_seen": 70516115, + "step": 3271, + "time_per_iteration": 3.0788683891296387 + }, + { + "auxiliary_loss_clip": 0.01184828, + "auxiliary_loss_mlp": 0.01025247, + "balance_loss_clip": 1.05506277, + "balance_loss_mlp": 1.01798701, + "epoch": 0.39343473817110564, + "flos": 15487159720320.0, + "grad_norm": 2.014715414844573, + "language_loss": 0.81232679, + "learning_rate": 2.7673455342214334e-06, + "loss": 0.83442754, + "num_input_tokens_seen": 70533105, + "step": 3272, + "time_per_iteration": 3.252774238586426 + }, + { + "auxiliary_loss_clip": 0.01171341, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.05465829, + "balance_loss_mlp": 1.021842, + "epoch": 0.39355498106174475, + "flos": 21324905809920.0, + "grad_norm": 1.7149714659231003, + "language_loss": 0.76088572, + "learning_rate": 2.7666261197784198e-06, + "loss": 0.78289151, + "num_input_tokens_seen": 70551920, + "step": 3273, + "time_per_iteration": 2.4684314727783203 + }, + { + "auxiliary_loss_clip": 0.01154392, + "auxiliary_loss_mlp": 0.01027443, + "balance_loss_clip": 1.05688882, + "balance_loss_mlp": 1.01951253, + "epoch": 0.3936752239523838, + "flos": 13296357400320.0, + "grad_norm": 1.8535559958076178, + "language_loss": 0.76550823, + "learning_rate": 2.7659065890388336e-06, + "loss": 0.78732657, + "num_input_tokens_seen": 70567920, + "step": 3274, + "time_per_iteration": 2.4735236167907715 + }, + { + "auxiliary_loss_clip": 0.0116069, + "auxiliary_loss_mlp": 0.01032118, + "balance_loss_clip": 1.05290723, + "balance_loss_mlp": 1.02391624, + "epoch": 0.3937954668430229, + "flos": 16800161472000.0, + "grad_norm": 1.8541142995945012, + "language_loss": 0.84696734, + "learning_rate": 2.7651869421118266e-06, + "loss": 0.86889541, + "num_input_tokens_seen": 70584530, + "step": 3275, + "time_per_iteration": 2.4858474731445312 + }, + { + "auxiliary_loss_clip": 0.01177781, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.05752325, + "balance_loss_mlp": 1.02223027, + "epoch": 0.393915709733662, + "flos": 21064229832960.0, + "grad_norm": 1.8418147506381601, + "language_loss": 0.82987547, + "learning_rate": 2.76446717910657e-06, + "loss": 0.85195422, + "num_input_tokens_seen": 70605235, + "step": 3276, + "time_per_iteration": 4.063091516494751 + }, + { + "auxiliary_loss_clip": 0.01169298, + "auxiliary_loss_mlp": 0.01028292, + "balance_loss_clip": 1.05397987, + "balance_loss_mlp": 1.0208652, + "epoch": 0.3940359526243011, + "flos": 17165265264000.0, + "grad_norm": 2.278352460426256, + "language_loss": 0.76940465, + "learning_rate": 2.763747300132249e-06, + "loss": 0.79138058, + "num_input_tokens_seen": 70622675, + "step": 3277, + "time_per_iteration": 2.530925750732422 + }, + { + "auxiliary_loss_clip": 0.01186039, + "auxiliary_loss_mlp": 0.01027329, + "balance_loss_clip": 1.05651355, + "balance_loss_mlp": 1.01966929, + "epoch": 0.3941561955149402, + "flos": 20995856294400.0, + "grad_norm": 2.4553229721494967, + "language_loss": 0.86797357, + "learning_rate": 2.7630273052980704e-06, + "loss": 0.89010721, + "num_input_tokens_seen": 70643265, + "step": 3278, + "time_per_iteration": 2.4568777084350586 + }, + { + "auxiliary_loss_clip": 0.01147155, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.05091381, + "balance_loss_mlp": 1.02043939, + "epoch": 0.39427643840557924, + "flos": 18843406721280.0, + "grad_norm": 2.1139731394065517, + "language_loss": 0.67189413, + "learning_rate": 2.7623071947132554e-06, + "loss": 0.69364685, + "num_input_tokens_seen": 70660295, + "step": 3279, + "time_per_iteration": 2.4969375133514404 + }, + { + "auxiliary_loss_clip": 0.0116263, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.0519774, + "balance_loss_mlp": 1.02210319, + "epoch": 0.39439668129621835, + "flos": 23258659426560.0, + "grad_norm": 2.0309250766719122, + "language_loss": 0.78960013, + "learning_rate": 2.7615869684870458e-06, + "loss": 0.81152523, + "num_input_tokens_seen": 70679605, + "step": 3280, + "time_per_iteration": 2.5203399658203125 + }, + { + "auxiliary_loss_clip": 0.0116898, + "auxiliary_loss_mlp": 0.01025797, + "balance_loss_clip": 1.05459476, + "balance_loss_mlp": 1.01813805, + "epoch": 0.39451692418685746, + "flos": 26652289507200.0, + "grad_norm": 1.6496987635799056, + "language_loss": 0.84804529, + "learning_rate": 2.7608666267286986e-06, + "loss": 0.86999303, + "num_input_tokens_seen": 70699835, + "step": 3281, + "time_per_iteration": 2.522454023361206 + }, + { + "auxiliary_loss_clip": 0.01110992, + "auxiliary_loss_mlp": 0.01028882, + "balance_loss_clip": 1.04392552, + "balance_loss_mlp": 1.02028084, + "epoch": 0.3946371670774965, + "flos": 18258709132800.0, + "grad_norm": 2.1428145301837414, + "language_loss": 0.8665821, + "learning_rate": 2.760146169547489e-06, + "loss": 0.88798082, + "num_input_tokens_seen": 70716600, + "step": 3282, + "time_per_iteration": 2.5824480056762695 + }, + { + "auxiliary_loss_clip": 0.01159153, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.056229, + "balance_loss_mlp": 1.0214746, + "epoch": 0.39475740996813563, + "flos": 24206126423040.0, + "grad_norm": 1.4804869744378997, + "language_loss": 0.76509339, + "learning_rate": 2.75942559705271e-06, + "loss": 0.78697741, + "num_input_tokens_seen": 70736335, + "step": 3283, + "time_per_iteration": 2.5750155448913574 + }, + { + "auxiliary_loss_clip": 0.01168846, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.05418956, + "balance_loss_mlp": 1.0238657, + "epoch": 0.39487765285877474, + "flos": 19317858491520.0, + "grad_norm": 1.885595811139525, + "language_loss": 0.89396888, + "learning_rate": 2.7587049093536713e-06, + "loss": 0.91597342, + "num_input_tokens_seen": 70752665, + "step": 3284, + "time_per_iteration": 2.457979679107666 + }, + { + "auxiliary_loss_clip": 0.01175037, + "auxiliary_loss_mlp": 0.01033707, + "balance_loss_clip": 1.05383873, + "balance_loss_mlp": 1.02561891, + "epoch": 0.3949978957494138, + "flos": 17311744926720.0, + "grad_norm": 1.828702407432217, + "language_loss": 0.80783391, + "learning_rate": 2.757984106559701e-06, + "loss": 0.82992136, + "num_input_tokens_seen": 70771650, + "step": 3285, + "time_per_iteration": 2.451009750366211 + }, + { + "auxiliary_loss_clip": 0.01150458, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.05323124, + "balance_loss_mlp": 1.02134013, + "epoch": 0.3951181386400529, + "flos": 36317861280000.0, + "grad_norm": 2.131549892746281, + "language_loss": 0.71350396, + "learning_rate": 2.7572631887801446e-06, + "loss": 0.73530793, + "num_input_tokens_seen": 70793275, + "step": 3286, + "time_per_iteration": 2.6279146671295166 + }, + { + "auxiliary_loss_clip": 0.01173297, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.05439973, + "balance_loss_mlp": 1.02444983, + "epoch": 0.395238381530692, + "flos": 23110348170240.0, + "grad_norm": 1.736145692358794, + "language_loss": 0.76463598, + "learning_rate": 2.7565421561243654e-06, + "loss": 0.78669786, + "num_input_tokens_seen": 70811440, + "step": 3287, + "time_per_iteration": 2.483351230621338 + }, + { + "auxiliary_loss_clip": 0.01135227, + "auxiliary_loss_mlp": 0.01027277, + "balance_loss_clip": 1.04888797, + "balance_loss_mlp": 1.01925969, + "epoch": 0.3953586244213311, + "flos": 24347614095360.0, + "grad_norm": 2.104480105140653, + "language_loss": 0.82103282, + "learning_rate": 2.7558210087017413e-06, + "loss": 0.8426578, + "num_input_tokens_seen": 70831375, + "step": 3288, + "time_per_iteration": 2.595792531967163 + }, + { + "auxiliary_loss_clip": 0.01140659, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.05560422, + "balance_loss_mlp": 1.02059579, + "epoch": 0.3954788673119702, + "flos": 23440080044160.0, + "grad_norm": 1.7170317189933366, + "language_loss": 0.73595458, + "learning_rate": 2.7550997466216724e-06, + "loss": 0.75765437, + "num_input_tokens_seen": 70849170, + "step": 3289, + "time_per_iteration": 2.534391403198242 + }, + { + "auxiliary_loss_clip": 0.01156417, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.05793762, + "balance_loss_mlp": 1.02278113, + "epoch": 0.3955991102026093, + "flos": 17494063384320.0, + "grad_norm": 1.795017574319635, + "language_loss": 0.81494516, + "learning_rate": 2.7543783699935714e-06, + "loss": 0.83681697, + "num_input_tokens_seen": 70867200, + "step": 3290, + "time_per_iteration": 2.4821085929870605 + }, + { + "auxiliary_loss_clip": 0.0117174, + "auxiliary_loss_mlp": 0.01029309, + "balance_loss_clip": 1.05730796, + "balance_loss_mlp": 1.02088094, + "epoch": 0.39571935309324835, + "flos": 18221326053120.0, + "grad_norm": 3.3685546131809874, + "language_loss": 0.86355239, + "learning_rate": 2.753656878926872e-06, + "loss": 0.8855629, + "num_input_tokens_seen": 70883080, + "step": 3291, + "time_per_iteration": 2.444225549697876 + }, + { + "auxiliary_loss_clip": 0.01145082, + "auxiliary_loss_mlp": 0.01028411, + "balance_loss_clip": 1.04885352, + "balance_loss_mlp": 1.01986361, + "epoch": 0.39583959598388746, + "flos": 17748813617280.0, + "grad_norm": 1.757528766332802, + "language_loss": 0.74279284, + "learning_rate": 2.752935273531023e-06, + "loss": 0.76452774, + "num_input_tokens_seen": 70901230, + "step": 3292, + "time_per_iteration": 2.4704136848449707 + }, + { + "auxiliary_loss_clip": 0.01172147, + "auxiliary_loss_mlp": 0.01026466, + "balance_loss_clip": 1.05391765, + "balance_loss_mlp": 1.01784158, + "epoch": 0.39595983887452657, + "flos": 19352368483200.0, + "grad_norm": 1.9902570651557974, + "language_loss": 0.78126264, + "learning_rate": 2.752213553915492e-06, + "loss": 0.80324876, + "num_input_tokens_seen": 70919585, + "step": 3293, + "time_per_iteration": 2.4594690799713135 + }, + { + "auxiliary_loss_clip": 0.01060207, + "auxiliary_loss_mlp": 0.01005951, + "balance_loss_clip": 1.01696682, + "balance_loss_mlp": 1.00484204, + "epoch": 0.3960800817651656, + "flos": 60682282940160.0, + "grad_norm": 0.8258775305275187, + "language_loss": 0.66093391, + "learning_rate": 2.751491720189762e-06, + "loss": 0.6815955, + "num_input_tokens_seen": 70977695, + "step": 3294, + "time_per_iteration": 3.0303730964660645 + }, + { + "auxiliary_loss_clip": 0.01155984, + "auxiliary_loss_mlp": 0.00762895, + "balance_loss_clip": 1.05299711, + "balance_loss_mlp": 1.00045896, + "epoch": 0.39620032465580474, + "flos": 16836718538880.0, + "grad_norm": 2.099570115804247, + "language_loss": 0.91771901, + "learning_rate": 2.7507697724633364e-06, + "loss": 0.93690777, + "num_input_tokens_seen": 70994455, + "step": 3295, + "time_per_iteration": 3.3124825954437256 + }, + { + "auxiliary_loss_clip": 0.01050161, + "auxiliary_loss_mlp": 0.01019746, + "balance_loss_clip": 1.02417922, + "balance_loss_mlp": 1.01763618, + "epoch": 0.3963205675464438, + "flos": 69071445941760.0, + "grad_norm": 0.779361968435171, + "language_loss": 0.54674315, + "learning_rate": 2.7500477108457327e-06, + "loss": 0.56744224, + "num_input_tokens_seen": 71046465, + "step": 3296, + "time_per_iteration": 2.927067279815674 + }, + { + "auxiliary_loss_clip": 0.0112561, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.04574406, + "balance_loss_mlp": 1.02032816, + "epoch": 0.3964408104370829, + "flos": 25667439431040.0, + "grad_norm": 1.8923014506886564, + "language_loss": 0.80510932, + "learning_rate": 2.7493255354464877e-06, + "loss": 0.82665831, + "num_input_tokens_seen": 71064275, + "step": 3297, + "time_per_iteration": 2.58343243598938 + }, + { + "auxiliary_loss_clip": 0.01099265, + "auxiliary_loss_mlp": 0.0102735, + "balance_loss_clip": 1.04591966, + "balance_loss_mlp": 1.01944637, + "epoch": 0.396561053327722, + "flos": 24277480790400.0, + "grad_norm": 2.2940253043828736, + "language_loss": 0.76256281, + "learning_rate": 2.748603246375156e-06, + "loss": 0.78382897, + "num_input_tokens_seen": 71082290, + "step": 3298, + "time_per_iteration": 2.6529412269592285 + }, + { + "auxiliary_loss_clip": 0.01185318, + "auxiliary_loss_mlp": 0.01034995, + "balance_loss_clip": 1.05652261, + "balance_loss_mlp": 1.02679586, + "epoch": 0.39668129621836107, + "flos": 20522302364160.0, + "grad_norm": 2.2860200728620304, + "language_loss": 0.70224375, + "learning_rate": 2.7478808437413055e-06, + "loss": 0.72444689, + "num_input_tokens_seen": 71101700, + "step": 3299, + "time_per_iteration": 3.29171085357666 + }, + { + "auxiliary_loss_clip": 0.01131616, + "auxiliary_loss_mlp": 0.01027461, + "balance_loss_clip": 1.05585361, + "balance_loss_mlp": 1.01921213, + "epoch": 0.3968015391090002, + "flos": 27052585649280.0, + "grad_norm": 1.8374207464210466, + "language_loss": 0.66066754, + "learning_rate": 2.7471583276545263e-06, + "loss": 0.68225831, + "num_input_tokens_seen": 71122360, + "step": 3300, + "time_per_iteration": 2.6148300170898438 + }, + { + "auxiliary_loss_clip": 0.0115726, + "auxiliary_loss_mlp": 0.01030419, + "balance_loss_clip": 1.0520978, + "balance_loss_mlp": 1.0227536, + "epoch": 0.3969217819996393, + "flos": 12531819392640.0, + "grad_norm": 2.314079434148456, + "language_loss": 0.70782506, + "learning_rate": 2.7464356982244224e-06, + "loss": 0.72970188, + "num_input_tokens_seen": 71140360, + "step": 3301, + "time_per_iteration": 2.4854118824005127 + }, + { + "auxiliary_loss_clip": 0.01074202, + "auxiliary_loss_mlp": 0.01004612, + "balance_loss_clip": 1.02616191, + "balance_loss_mlp": 1.0026803, + "epoch": 0.39704202489027834, + "flos": 66241399230720.0, + "grad_norm": 0.7736484151504005, + "language_loss": 0.61720312, + "learning_rate": 2.745712955560617e-06, + "loss": 0.63799131, + "num_input_tokens_seen": 71196565, + "step": 3302, + "time_per_iteration": 3.7873544692993164 + }, + { + "auxiliary_loss_clip": 0.01113514, + "auxiliary_loss_mlp": 0.01029657, + "balance_loss_clip": 1.04912424, + "balance_loss_mlp": 1.02062714, + "epoch": 0.39716226778091746, + "flos": 16982982720000.0, + "grad_norm": 3.027195481383852, + "language_loss": 0.77458537, + "learning_rate": 2.7449900997727496e-06, + "loss": 0.79601711, + "num_input_tokens_seen": 71214675, + "step": 3303, + "time_per_iteration": 3.357720375061035 + }, + { + "auxiliary_loss_clip": 0.01156318, + "auxiliary_loss_mlp": 0.01031392, + "balance_loss_clip": 1.05588877, + "balance_loss_mlp": 1.02373898, + "epoch": 0.39728251067155657, + "flos": 23477139901440.0, + "grad_norm": 2.061190945910885, + "language_loss": 0.84094739, + "learning_rate": 2.744267130970476e-06, + "loss": 0.86282444, + "num_input_tokens_seen": 71234400, + "step": 3304, + "time_per_iteration": 2.509260892868042 + }, + { + "auxiliary_loss_clip": 0.01153412, + "auxiliary_loss_mlp": 0.01030891, + "balance_loss_clip": 1.05415094, + "balance_loss_mlp": 1.02217698, + "epoch": 0.3974027535621956, + "flos": 20704441253760.0, + "grad_norm": 1.738133313676558, + "language_loss": 0.77289069, + "learning_rate": 2.7435440492634697e-06, + "loss": 0.7947337, + "num_input_tokens_seen": 71253725, + "step": 3305, + "time_per_iteration": 2.5091795921325684 + }, + { + "auxiliary_loss_clip": 0.01155957, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.05122185, + "balance_loss_mlp": 1.02337551, + "epoch": 0.39752299645283473, + "flos": 21543278544000.0, + "grad_norm": 2.6393113719047503, + "language_loss": 0.67367053, + "learning_rate": 2.7428208547614228e-06, + "loss": 0.69555199, + "num_input_tokens_seen": 71273220, + "step": 3306, + "time_per_iteration": 2.504953384399414 + }, + { + "auxiliary_loss_clip": 0.01173746, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.05654716, + "balance_loss_mlp": 1.02468312, + "epoch": 0.39764323934347384, + "flos": 19208295031680.0, + "grad_norm": 1.9824610939563567, + "language_loss": 0.77812493, + "learning_rate": 2.742097547574043e-06, + "loss": 0.80018967, + "num_input_tokens_seen": 71291445, + "step": 3307, + "time_per_iteration": 2.4473907947540283 + }, + { + "auxiliary_loss_clip": 0.01162014, + "auxiliary_loss_mlp": 0.00762786, + "balance_loss_clip": 1.05259252, + "balance_loss_mlp": 1.00036943, + "epoch": 0.3977634822341129, + "flos": 20850202644480.0, + "grad_norm": 1.9633182423986277, + "language_loss": 0.77651799, + "learning_rate": 2.7413741278110544e-06, + "loss": 0.795766, + "num_input_tokens_seen": 71310135, + "step": 3308, + "time_per_iteration": 2.5101242065429688 + }, + { + "auxiliary_loss_clip": 0.01162292, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.05502284, + "balance_loss_mlp": 1.02219105, + "epoch": 0.397883725124752, + "flos": 39786042038400.0, + "grad_norm": 2.3258358048190773, + "language_loss": 0.68837166, + "learning_rate": 2.7406505955822016e-06, + "loss": 0.71030498, + "num_input_tokens_seen": 71331160, + "step": 3309, + "time_per_iteration": 2.662421941757202 + }, + { + "auxiliary_loss_clip": 0.01157974, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.05229855, + "balance_loss_mlp": 1.02196276, + "epoch": 0.39800396801539106, + "flos": 17379507934080.0, + "grad_norm": 4.099665470047776, + "language_loss": 0.66162884, + "learning_rate": 2.7399269509972415e-06, + "loss": 0.68350959, + "num_input_tokens_seen": 71345315, + "step": 3310, + "time_per_iteration": 2.463991403579712 + }, + { + "auxiliary_loss_clip": 0.01148981, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.047032, + "balance_loss_mlp": 1.02070713, + "epoch": 0.3981242109060302, + "flos": 19202764337280.0, + "grad_norm": 2.4049500361340743, + "language_loss": 0.85456431, + "learning_rate": 2.7392031941659514e-06, + "loss": 0.87635148, + "num_input_tokens_seen": 71363160, + "step": 3311, + "time_per_iteration": 2.4707274436950684 + }, + { + "auxiliary_loss_clip": 0.01158328, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.05653465, + "balance_loss_mlp": 1.03034306, + "epoch": 0.3982444537966693, + "flos": 24565124903040.0, + "grad_norm": 1.7721481224473024, + "language_loss": 0.85882705, + "learning_rate": 2.7384793251981244e-06, + "loss": 0.88079906, + "num_input_tokens_seen": 71382145, + "step": 3312, + "time_per_iteration": 2.520378589630127 + }, + { + "auxiliary_loss_clip": 0.01177656, + "auxiliary_loss_mlp": 0.01030183, + "balance_loss_clip": 1.05459666, + "balance_loss_mlp": 1.02215445, + "epoch": 0.39836469668730834, + "flos": 26213856099840.0, + "grad_norm": 1.8308992717947343, + "language_loss": 0.81013483, + "learning_rate": 2.737755344203571e-06, + "loss": 0.83221328, + "num_input_tokens_seen": 71402095, + "step": 3313, + "time_per_iteration": 2.517637252807617 + }, + { + "auxiliary_loss_clip": 0.01177489, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.05829382, + "balance_loss_mlp": 1.02346861, + "epoch": 0.39848493957794745, + "flos": 27636134002560.0, + "grad_norm": 1.7078794558468287, + "language_loss": 0.79774928, + "learning_rate": 2.7370312512921186e-06, + "loss": 0.81983846, + "num_input_tokens_seen": 71423875, + "step": 3314, + "time_per_iteration": 2.5207886695861816 + }, + { + "auxiliary_loss_clip": 0.01160218, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.05208588, + "balance_loss_mlp": 1.02687252, + "epoch": 0.39860518246858656, + "flos": 12239326944000.0, + "grad_norm": 2.4720947779905376, + "language_loss": 0.76794827, + "learning_rate": 2.736307046573611e-06, + "loss": 0.78991288, + "num_input_tokens_seen": 71439745, + "step": 3315, + "time_per_iteration": 2.466719150543213 + }, + { + "auxiliary_loss_clip": 0.01184243, + "auxiliary_loss_mlp": 0.01025953, + "balance_loss_clip": 1.05461872, + "balance_loss_mlp": 1.01816869, + "epoch": 0.3987254253592256, + "flos": 22379135005440.0, + "grad_norm": 1.6471847826049513, + "language_loss": 0.82011878, + "learning_rate": 2.73558273015791e-06, + "loss": 0.84222078, + "num_input_tokens_seen": 71459575, + "step": 3316, + "time_per_iteration": 2.444368362426758 + }, + { + "auxiliary_loss_clip": 0.01189895, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.05685544, + "balance_loss_mlp": 1.02311206, + "epoch": 0.3988456682498647, + "flos": 23514020190720.0, + "grad_norm": 2.686505729113726, + "language_loss": 0.70428991, + "learning_rate": 2.734858302154894e-06, + "loss": 0.72651255, + "num_input_tokens_seen": 71481075, + "step": 3317, + "time_per_iteration": 2.4510557651519775 + }, + { + "auxiliary_loss_clip": 0.01152969, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.05282629, + "balance_loss_mlp": 1.02468944, + "epoch": 0.39896591114050384, + "flos": 19208761908480.0, + "grad_norm": 2.3979298983238664, + "language_loss": 0.76490843, + "learning_rate": 2.734133762674457e-06, + "loss": 0.78676975, + "num_input_tokens_seen": 71500665, + "step": 3318, + "time_per_iteration": 2.485837697982788 + }, + { + "auxiliary_loss_clip": 0.01159016, + "auxiliary_loss_mlp": 0.01031311, + "balance_loss_clip": 1.05365241, + "balance_loss_mlp": 1.02275157, + "epoch": 0.3990861540311429, + "flos": 28401031146240.0, + "grad_norm": 2.025857031041854, + "language_loss": 0.70574152, + "learning_rate": 2.7334091118265124e-06, + "loss": 0.7276448, + "num_input_tokens_seen": 71522560, + "step": 3319, + "time_per_iteration": 2.5475356578826904 + }, + { + "auxiliary_loss_clip": 0.01073373, + "auxiliary_loss_mlp": 0.01011207, + "balance_loss_clip": 1.01629364, + "balance_loss_mlp": 1.01012802, + "epoch": 0.399206396921782, + "flos": 61758563086080.0, + "grad_norm": 0.6793530427591613, + "language_loss": 0.57815641, + "learning_rate": 2.732684349720989e-06, + "loss": 0.59900218, + "num_input_tokens_seen": 71590520, + "step": 3320, + "time_per_iteration": 3.0509047508239746 + }, + { + "auxiliary_loss_clip": 0.01148158, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.05172324, + "balance_loss_mlp": 1.02026963, + "epoch": 0.3993266398124211, + "flos": 28074567409920.0, + "grad_norm": 1.7190728990591573, + "language_loss": 0.75219119, + "learning_rate": 2.7319594764678318e-06, + "loss": 0.77395737, + "num_input_tokens_seen": 71612620, + "step": 3321, + "time_per_iteration": 2.583740711212158 + }, + { + "auxiliary_loss_clip": 0.01134205, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.0516715, + "balance_loss_mlp": 1.02612579, + "epoch": 0.39944688270306017, + "flos": 23225083188480.0, + "grad_norm": 1.7419892730984303, + "language_loss": 0.83222747, + "learning_rate": 2.7312344921770044e-06, + "loss": 0.85391974, + "num_input_tokens_seen": 71634320, + "step": 3322, + "time_per_iteration": 3.4414803981781006 + }, + { + "auxiliary_loss_clip": 0.01156907, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.05087423, + "balance_loss_mlp": 1.02412939, + "epoch": 0.3995671255936993, + "flos": 19390433921280.0, + "grad_norm": 2.1001649743543584, + "language_loss": 0.78482485, + "learning_rate": 2.7305093969584857e-06, + "loss": 0.80671394, + "num_input_tokens_seen": 71653145, + "step": 3323, + "time_per_iteration": 2.519064426422119 + }, + { + "auxiliary_loss_clip": 0.01165869, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.05294657, + "balance_loss_mlp": 1.02350187, + "epoch": 0.3996873684843384, + "flos": 23842638743040.0, + "grad_norm": 2.1214910284084754, + "language_loss": 0.80035973, + "learning_rate": 2.729784190922272e-06, + "loss": 0.8223334, + "num_input_tokens_seen": 71674580, + "step": 3324, + "time_per_iteration": 2.483886957168579 + }, + { + "auxiliary_loss_clip": 0.01062868, + "auxiliary_loss_mlp": 0.0100181, + "balance_loss_clip": 1.01761556, + "balance_loss_mlp": 1.00079083, + "epoch": 0.39980761137497745, + "flos": 66576877280640.0, + "grad_norm": 0.93636568524594, + "language_loss": 0.57192075, + "learning_rate": 2.729058874178378e-06, + "loss": 0.59256756, + "num_input_tokens_seen": 71745260, + "step": 3325, + "time_per_iteration": 3.1218621730804443 + }, + { + "auxiliary_loss_clip": 0.01148908, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.05112278, + "balance_loss_mlp": 1.02364182, + "epoch": 0.39992785426561656, + "flos": 28549162834560.0, + "grad_norm": 2.227029100420481, + "language_loss": 0.69408619, + "learning_rate": 2.7283334468368315e-06, + "loss": 0.71589637, + "num_input_tokens_seen": 71766540, + "step": 3326, + "time_per_iteration": 3.4614078998565674 + }, + { + "auxiliary_loss_clip": 0.01094289, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.04281664, + "balance_loss_mlp": 1.02024627, + "epoch": 0.4000480971562556, + "flos": 15049408671360.0, + "grad_norm": 1.794609623628467, + "language_loss": 0.73329663, + "learning_rate": 2.72760790900768e-06, + "loss": 0.75452864, + "num_input_tokens_seen": 71783125, + "step": 3327, + "time_per_iteration": 2.5859737396240234 + }, + { + "auxiliary_loss_clip": 0.01187686, + "auxiliary_loss_mlp": 0.01028176, + "balance_loss_clip": 1.05732679, + "balance_loss_mlp": 1.02002192, + "epoch": 0.4001683400468947, + "flos": 23915609222400.0, + "grad_norm": 3.023837708122538, + "language_loss": 0.78608185, + "learning_rate": 2.7268822608009875e-06, + "loss": 0.80824047, + "num_input_tokens_seen": 71802500, + "step": 3328, + "time_per_iteration": 2.461385726928711 + }, + { + "auxiliary_loss_clip": 0.01148319, + "auxiliary_loss_mlp": 0.01030315, + "balance_loss_clip": 1.0515132, + "balance_loss_mlp": 1.02224481, + "epoch": 0.40028858293753383, + "flos": 24352677912960.0, + "grad_norm": 2.002312701284234, + "language_loss": 0.78241062, + "learning_rate": 2.726156502326834e-06, + "loss": 0.80419701, + "num_input_tokens_seen": 71823800, + "step": 3329, + "time_per_iteration": 3.4065518379211426 + }, + { + "auxiliary_loss_clip": 0.01037392, + "auxiliary_loss_mlp": 0.01006301, + "balance_loss_clip": 1.02475178, + "balance_loss_mlp": 1.00453663, + "epoch": 0.4004088258281729, + "flos": 66787025800320.0, + "grad_norm": 0.6954955816553068, + "language_loss": 0.60248381, + "learning_rate": 2.725430633695316e-06, + "loss": 0.62292069, + "num_input_tokens_seen": 71886880, + "step": 3330, + "time_per_iteration": 3.9855246543884277 + }, + { + "auxiliary_loss_clip": 0.01081976, + "auxiliary_loss_mlp": 0.01001849, + "balance_loss_clip": 1.01486754, + "balance_loss_mlp": 1.00086582, + "epoch": 0.400529068718812, + "flos": 58598386473600.0, + "grad_norm": 0.8859229209750509, + "language_loss": 0.57941341, + "learning_rate": 2.7247046550165485e-06, + "loss": 0.60025156, + "num_input_tokens_seen": 71939005, + "step": 3331, + "time_per_iteration": 3.096722364425659 + }, + { + "auxiliary_loss_clip": 0.0118897, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.05753112, + "balance_loss_mlp": 1.02147448, + "epoch": 0.4006493116094511, + "flos": 25377460934400.0, + "grad_norm": 1.405170299905092, + "language_loss": 0.76002222, + "learning_rate": 2.7239785664006606e-06, + "loss": 0.78220809, + "num_input_tokens_seen": 71962545, + "step": 3332, + "time_per_iteration": 2.495569944381714 + }, + { + "auxiliary_loss_clip": 0.01070862, + "auxiliary_loss_mlp": 0.01002334, + "balance_loss_clip": 1.01329231, + "balance_loss_mlp": 1.00127268, + "epoch": 0.40076955450009016, + "flos": 60280729822080.0, + "grad_norm": 0.8899218323031897, + "language_loss": 0.6179955, + "learning_rate": 2.7232523679578002e-06, + "loss": 0.63872755, + "num_input_tokens_seen": 72025625, + "step": 3333, + "time_per_iteration": 3.0788426399230957 + }, + { + "auxiliary_loss_clip": 0.01168915, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.05561137, + "balance_loss_mlp": 1.01842427, + "epoch": 0.4008897973907293, + "flos": 16617268396800.0, + "grad_norm": 2.2695694963287236, + "language_loss": 0.79483086, + "learning_rate": 2.7225260597981295e-06, + "loss": 0.81678057, + "num_input_tokens_seen": 72043330, + "step": 3334, + "time_per_iteration": 2.439711332321167 + }, + { + "auxiliary_loss_clip": 0.01144179, + "auxiliary_loss_mlp": 0.00763604, + "balance_loss_clip": 1.05355275, + "balance_loss_mlp": 1.0002563, + "epoch": 0.4010100402813684, + "flos": 15377344865280.0, + "grad_norm": 2.8254115072612023, + "language_loss": 0.78385192, + "learning_rate": 2.721799642031831e-06, + "loss": 0.80292976, + "num_input_tokens_seen": 72059500, + "step": 3335, + "time_per_iteration": 2.494485378265381 + }, + { + "auxiliary_loss_clip": 0.01160482, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.04908133, + "balance_loss_mlp": 1.02450252, + "epoch": 0.40113028317200744, + "flos": 13298835438720.0, + "grad_norm": 1.926169863973025, + "language_loss": 0.78056395, + "learning_rate": 2.721073114769101e-06, + "loss": 0.80249709, + "num_input_tokens_seen": 72077175, + "step": 3336, + "time_per_iteration": 2.4852471351623535 + }, + { + "auxiliary_loss_clip": 0.01138502, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.05059814, + "balance_loss_mlp": 1.02478826, + "epoch": 0.40125052606264655, + "flos": 20668027841280.0, + "grad_norm": 1.7518925359226891, + "language_loss": 0.75076759, + "learning_rate": 2.7203464781201523e-06, + "loss": 0.772479, + "num_input_tokens_seen": 72096490, + "step": 3337, + "time_per_iteration": 2.537952423095703 + }, + { + "auxiliary_loss_clip": 0.01187696, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.05787134, + "balance_loss_mlp": 1.02460778, + "epoch": 0.40137076895328566, + "flos": 24607679541120.0, + "grad_norm": 2.6937475281623735, + "language_loss": 0.78147501, + "learning_rate": 2.719619732195215e-06, + "loss": 0.80367613, + "num_input_tokens_seen": 72118130, + "step": 3338, + "time_per_iteration": 2.4856388568878174 + }, + { + "auxiliary_loss_clip": 0.01144691, + "auxiliary_loss_mlp": 0.01026656, + "balance_loss_clip": 1.05097723, + "balance_loss_mlp": 1.01882362, + "epoch": 0.4014910118439247, + "flos": 24206593299840.0, + "grad_norm": 1.5229698028392253, + "language_loss": 0.72668022, + "learning_rate": 2.7188928771045377e-06, + "loss": 0.74839365, + "num_input_tokens_seen": 72139450, + "step": 3339, + "time_per_iteration": 2.5688064098358154 + }, + { + "auxiliary_loss_clip": 0.01138505, + "auxiliary_loss_mlp": 0.01028556, + "balance_loss_clip": 1.04984426, + "balance_loss_mlp": 1.02093291, + "epoch": 0.4016112547345638, + "flos": 26725080418560.0, + "grad_norm": 1.8002474030386153, + "language_loss": 0.79796803, + "learning_rate": 2.7181659129583815e-06, + "loss": 0.81963861, + "num_input_tokens_seen": 72159040, + "step": 3340, + "time_per_iteration": 2.5726680755615234 + }, + { + "auxiliary_loss_clip": 0.0114701, + "auxiliary_loss_mlp": 0.0102788, + "balance_loss_clip": 1.0453589, + "balance_loss_mlp": 1.0197382, + "epoch": 0.4017314976252029, + "flos": 21288025520640.0, + "grad_norm": 1.7388359306240395, + "language_loss": 0.75771153, + "learning_rate": 2.7174388398670276e-06, + "loss": 0.77946043, + "num_input_tokens_seen": 72178220, + "step": 3341, + "time_per_iteration": 2.530214548110962 + }, + { + "auxiliary_loss_clip": 0.01186283, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.05222499, + "balance_loss_mlp": 1.02652526, + "epoch": 0.401851740515842, + "flos": 25484690010240.0, + "grad_norm": 2.952699611839034, + "language_loss": 0.92222655, + "learning_rate": 2.716711657940773e-06, + "loss": 0.94443727, + "num_input_tokens_seen": 72199230, + "step": 3342, + "time_per_iteration": 2.481170415878296 + }, + { + "auxiliary_loss_clip": 0.01052325, + "auxiliary_loss_mlp": 0.0100261, + "balance_loss_clip": 1.01494741, + "balance_loss_mlp": 1.0016917, + "epoch": 0.4019719834064811, + "flos": 55395334978560.0, + "grad_norm": 0.8103047138159378, + "language_loss": 0.56454337, + "learning_rate": 2.7159843672899284e-06, + "loss": 0.58509266, + "num_input_tokens_seen": 72263430, + "step": 3343, + "time_per_iteration": 3.2316973209381104 + }, + { + "auxiliary_loss_clip": 0.01173996, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.056916, + "balance_loss_mlp": 1.02306914, + "epoch": 0.40209222629712016, + "flos": 18180100218240.0, + "grad_norm": 1.9041991478084044, + "language_loss": 0.81355417, + "learning_rate": 2.715256968024825e-06, + "loss": 0.83560884, + "num_input_tokens_seen": 72280505, + "step": 3344, + "time_per_iteration": 2.44474720954895 + }, + { + "auxiliary_loss_clip": 0.0116394, + "auxiliary_loss_mlp": 0.01031575, + "balance_loss_clip": 1.05348635, + "balance_loss_mlp": 1.02326047, + "epoch": 0.40221246918775927, + "flos": 25961009287680.0, + "grad_norm": 1.488175491406286, + "language_loss": 0.82278991, + "learning_rate": 2.7145294602558083e-06, + "loss": 0.84474498, + "num_input_tokens_seen": 72301215, + "step": 3345, + "time_per_iteration": 2.579402208328247 + }, + { + "auxiliary_loss_clip": 0.01172224, + "auxiliary_loss_mlp": 0.01026417, + "balance_loss_clip": 1.05363774, + "balance_loss_mlp": 1.01757193, + "epoch": 0.4023327120783984, + "flos": 33838912056960.0, + "grad_norm": 1.6961621906706548, + "language_loss": 0.70874989, + "learning_rate": 2.713801844093241e-06, + "loss": 0.73073626, + "num_input_tokens_seen": 72322365, + "step": 3346, + "time_per_iteration": 2.5805366039276123 + }, + { + "auxiliary_loss_clip": 0.01172387, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.05469322, + "balance_loss_mlp": 1.02582586, + "epoch": 0.40245295496903744, + "flos": 26900252069760.0, + "grad_norm": 4.329955084016142, + "language_loss": 0.88497341, + "learning_rate": 2.7130741196475014e-06, + "loss": 0.90703082, + "num_input_tokens_seen": 72340495, + "step": 3347, + "time_per_iteration": 2.5114622116088867 + }, + { + "auxiliary_loss_clip": 0.01163658, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.0569824, + "balance_loss_mlp": 1.02243531, + "epoch": 0.40257319785967655, + "flos": 36902738436480.0, + "grad_norm": 1.7347704773764612, + "language_loss": 0.79523528, + "learning_rate": 2.7123462870289848e-06, + "loss": 0.81718373, + "num_input_tokens_seen": 72360545, + "step": 3348, + "time_per_iteration": 3.445404052734375 + }, + { + "auxiliary_loss_clip": 0.01158163, + "auxiliary_loss_mlp": 0.01027335, + "balance_loss_clip": 1.04989457, + "balance_loss_mlp": 1.0188446, + "epoch": 0.40269344075031566, + "flos": 24353180703360.0, + "grad_norm": 1.5570079333617244, + "language_loss": 0.81313539, + "learning_rate": 2.711618346348102e-06, + "loss": 0.83499038, + "num_input_tokens_seen": 72381070, + "step": 3349, + "time_per_iteration": 2.5344419479370117 + }, + { + "auxiliary_loss_clip": 0.01151823, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.05239129, + "balance_loss_mlp": 1.02533638, + "epoch": 0.4028136836409547, + "flos": 14389657614720.0, + "grad_norm": 1.7204618575155866, + "language_loss": 0.63533592, + "learning_rate": 2.7108902977152825e-06, + "loss": 0.65718997, + "num_input_tokens_seen": 72398970, + "step": 3350, + "time_per_iteration": 2.4805235862731934 + }, + { + "auxiliary_loss_clip": 0.01168873, + "auxiliary_loss_mlp": 0.0102978, + "balance_loss_clip": 1.05275476, + "balance_loss_mlp": 1.02146256, + "epoch": 0.4029339265315938, + "flos": 26136037284480.0, + "grad_norm": 2.2334036188005415, + "language_loss": 0.75367528, + "learning_rate": 2.7101621412409704e-06, + "loss": 0.77566183, + "num_input_tokens_seen": 72418455, + "step": 3351, + "time_per_iteration": 2.525186538696289 + }, + { + "auxiliary_loss_clip": 0.01186413, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.05487633, + "balance_loss_mlp": 1.02405632, + "epoch": 0.40305416942223293, + "flos": 23256325042560.0, + "grad_norm": 1.8354931310081248, + "language_loss": 0.85628814, + "learning_rate": 2.7094338770356256e-06, + "loss": 0.8784759, + "num_input_tokens_seen": 72437540, + "step": 3352, + "time_per_iteration": 3.27976393699646 + }, + { + "auxiliary_loss_clip": 0.01154016, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.05235922, + "balance_loss_mlp": 1.02269602, + "epoch": 0.403174412312872, + "flos": 27089645506560.0, + "grad_norm": 2.629118436396993, + "language_loss": 0.63947296, + "learning_rate": 2.708705505209726e-06, + "loss": 0.66132081, + "num_input_tokens_seen": 72458315, + "step": 3353, + "time_per_iteration": 2.5375888347625732 + }, + { + "auxiliary_loss_clip": 0.01122439, + "auxiliary_loss_mlp": 0.01026105, + "balance_loss_clip": 1.04585087, + "balance_loss_mlp": 1.01825547, + "epoch": 0.4032946552035111, + "flos": 21756336065280.0, + "grad_norm": 1.9407194038834914, + "language_loss": 0.91799098, + "learning_rate": 2.7079770258737646e-06, + "loss": 0.93947643, + "num_input_tokens_seen": 72476225, + "step": 3354, + "time_per_iteration": 2.582780599594116 + }, + { + "auxiliary_loss_clip": 0.01140371, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.04916334, + "balance_loss_mlp": 1.02134299, + "epoch": 0.4034148980941502, + "flos": 17343956448000.0, + "grad_norm": 3.0027357754337194, + "language_loss": 0.75169086, + "learning_rate": 2.707248439138251e-06, + "loss": 0.77339977, + "num_input_tokens_seen": 72492460, + "step": 3355, + "time_per_iteration": 3.221224069595337 + }, + { + "auxiliary_loss_clip": 0.01156202, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.05601764, + "balance_loss_mlp": 1.02368355, + "epoch": 0.40353514098478926, + "flos": 22017838055040.0, + "grad_norm": 1.7240781659763247, + "language_loss": 0.65557688, + "learning_rate": 2.7065197451137114e-06, + "loss": 0.67745423, + "num_input_tokens_seen": 72513840, + "step": 3356, + "time_per_iteration": 2.5538177490234375 + }, + { + "auxiliary_loss_clip": 0.01158999, + "auxiliary_loss_mlp": 0.01030862, + "balance_loss_clip": 1.0540297, + "balance_loss_mlp": 1.02254725, + "epoch": 0.4036553838754284, + "flos": 14246446089600.0, + "grad_norm": 2.2061355480707143, + "language_loss": 0.67547947, + "learning_rate": 2.7057909439106894e-06, + "loss": 0.69737804, + "num_input_tokens_seen": 72531695, + "step": 3357, + "time_per_iteration": 3.2442209720611572 + }, + { + "auxiliary_loss_clip": 0.01164787, + "auxiliary_loss_mlp": 0.00763371, + "balance_loss_clip": 1.05338669, + "balance_loss_mlp": 1.00018477, + "epoch": 0.40377562676606743, + "flos": 24790644443520.0, + "grad_norm": 2.709797235465548, + "language_loss": 0.78505218, + "learning_rate": 2.7050620356397417e-06, + "loss": 0.80433381, + "num_input_tokens_seen": 72550645, + "step": 3358, + "time_per_iteration": 2.5643270015716553 + }, + { + "auxiliary_loss_clip": 0.01184855, + "auxiliary_loss_mlp": 0.01025171, + "balance_loss_clip": 1.05713582, + "balance_loss_mlp": 1.01741993, + "epoch": 0.40389586965670654, + "flos": 24061226958720.0, + "grad_norm": 1.7007413928700912, + "language_loss": 0.72321671, + "learning_rate": 2.7043330204114437e-06, + "loss": 0.74531698, + "num_input_tokens_seen": 72569355, + "step": 3359, + "time_per_iteration": 2.469942092895508 + }, + { + "auxiliary_loss_clip": 0.01181486, + "auxiliary_loss_mlp": 0.01028704, + "balance_loss_clip": 1.05370665, + "balance_loss_mlp": 1.02064538, + "epoch": 0.40401611254734565, + "flos": 16399613934720.0, + "grad_norm": 2.0329688455435386, + "language_loss": 0.85432428, + "learning_rate": 2.7036038983363862e-06, + "loss": 0.87642622, + "num_input_tokens_seen": 72585960, + "step": 3360, + "time_per_iteration": 2.3977837562561035 + }, + { + "auxiliary_loss_clip": 0.01168561, + "auxiliary_loss_mlp": 0.0102978, + "balance_loss_clip": 1.05547059, + "balance_loss_mlp": 1.02231503, + "epoch": 0.4041363554379847, + "flos": 23988220565760.0, + "grad_norm": 1.7792993108996924, + "language_loss": 0.84526694, + "learning_rate": 2.702874669525177e-06, + "loss": 0.86725038, + "num_input_tokens_seen": 72604440, + "step": 3361, + "time_per_iteration": 2.482865571975708 + }, + { + "auxiliary_loss_clip": 0.01149164, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.05765474, + "balance_loss_mlp": 1.0224092, + "epoch": 0.4042565983286238, + "flos": 28401964899840.0, + "grad_norm": 2.085179581183854, + "language_loss": 0.69770318, + "learning_rate": 2.7021453340884394e-06, + "loss": 0.71950603, + "num_input_tokens_seen": 72622165, + "step": 3362, + "time_per_iteration": 2.5826261043548584 + }, + { + "auxiliary_loss_clip": 0.01149949, + "auxiliary_loss_mlp": 0.00762972, + "balance_loss_clip": 1.05432153, + "balance_loss_mlp": 1.00017297, + "epoch": 0.40437684121926293, + "flos": 17710963660800.0, + "grad_norm": 2.270351245482001, + "language_loss": 0.73048806, + "learning_rate": 2.7014158921368125e-06, + "loss": 0.74961734, + "num_input_tokens_seen": 72640490, + "step": 3363, + "time_per_iteration": 2.470259666442871 + }, + { + "auxiliary_loss_clip": 0.01187606, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.05740142, + "balance_loss_mlp": 1.02351713, + "epoch": 0.404497084109902, + "flos": 24018959629440.0, + "grad_norm": 2.6346032838777114, + "language_loss": 0.86016029, + "learning_rate": 2.700686343780953e-06, + "loss": 0.88235652, + "num_input_tokens_seen": 72660360, + "step": 3364, + "time_per_iteration": 2.4738242626190186 + }, + { + "auxiliary_loss_clip": 0.0115954, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.05222845, + "balance_loss_mlp": 1.01986027, + "epoch": 0.4046173270005411, + "flos": 22929861306240.0, + "grad_norm": 1.5938177062569232, + "language_loss": 0.88310719, + "learning_rate": 2.699956689131532e-06, + "loss": 0.90498561, + "num_input_tokens_seen": 72680345, + "step": 3365, + "time_per_iteration": 2.5097124576568604 + }, + { + "auxiliary_loss_clip": 0.01162785, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.0553143, + "balance_loss_mlp": 1.02044165, + "epoch": 0.4047375698911802, + "flos": 20668135582080.0, + "grad_norm": 2.7726547696088293, + "language_loss": 0.84795403, + "learning_rate": 2.699226928299238e-06, + "loss": 0.86987019, + "num_input_tokens_seen": 72698365, + "step": 3366, + "time_per_iteration": 2.494499444961548 + }, + { + "auxiliary_loss_clip": 0.01176626, + "auxiliary_loss_mlp": 0.01032701, + "balance_loss_clip": 1.05664659, + "balance_loss_mlp": 1.02492237, + "epoch": 0.40485781278181926, + "flos": 28912865996160.0, + "grad_norm": 2.337205935409372, + "language_loss": 0.79043329, + "learning_rate": 2.698497061394774e-06, + "loss": 0.81252658, + "num_input_tokens_seen": 72716850, + "step": 3367, + "time_per_iteration": 2.506425380706787 + }, + { + "auxiliary_loss_clip": 0.01152816, + "auxiliary_loss_mlp": 0.00762704, + "balance_loss_clip": 1.05440688, + "balance_loss_mlp": 1.00016999, + "epoch": 0.40497805567245837, + "flos": 23148377694720.0, + "grad_norm": 3.083649654128188, + "language_loss": 0.80556291, + "learning_rate": 2.6977670885288627e-06, + "loss": 0.82471812, + "num_input_tokens_seen": 72738250, + "step": 3368, + "time_per_iteration": 2.554211139678955 + }, + { + "auxiliary_loss_clip": 0.01147126, + "auxiliary_loss_mlp": 0.01032159, + "balance_loss_clip": 1.05112576, + "balance_loss_mlp": 1.02369559, + "epoch": 0.4050982985630975, + "flos": 16289404030080.0, + "grad_norm": 1.6620764289472865, + "language_loss": 0.75343096, + "learning_rate": 2.6970370098122378e-06, + "loss": 0.77522385, + "num_input_tokens_seen": 72755235, + "step": 3369, + "time_per_iteration": 2.4633595943450928 + }, + { + "auxiliary_loss_clip": 0.01186289, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.05559802, + "balance_loss_mlp": 1.0200001, + "epoch": 0.40521854145373654, + "flos": 34459484353920.0, + "grad_norm": 1.5045027977382917, + "language_loss": 0.86840677, + "learning_rate": 2.6963068253556535e-06, + "loss": 0.8905468, + "num_input_tokens_seen": 72776620, + "step": 3370, + "time_per_iteration": 2.544238328933716 + }, + { + "auxiliary_loss_clip": 0.01180854, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.05515289, + "balance_loss_mlp": 1.02268815, + "epoch": 0.40533878434437565, + "flos": 25331099454720.0, + "grad_norm": 2.1361957414435926, + "language_loss": 0.85629225, + "learning_rate": 2.6955765352698763e-06, + "loss": 0.87842023, + "num_input_tokens_seen": 72796765, + "step": 3371, + "time_per_iteration": 2.522141218185425 + }, + { + "auxiliary_loss_clip": 0.01190627, + "auxiliary_loss_mlp": 0.01027998, + "balance_loss_clip": 1.05681491, + "balance_loss_mlp": 1.01903987, + "epoch": 0.40545902723501476, + "flos": 15012061505280.0, + "grad_norm": 2.0548145858599893, + "language_loss": 0.73062837, + "learning_rate": 2.6948461396656923e-06, + "loss": 0.75281459, + "num_input_tokens_seen": 72814175, + "step": 3372, + "time_per_iteration": 2.4054653644561768 + }, + { + "auxiliary_loss_clip": 0.01179153, + "auxiliary_loss_mlp": 0.01032196, + "balance_loss_clip": 1.05636632, + "balance_loss_mlp": 1.02388716, + "epoch": 0.4055792701256538, + "flos": 25521103422720.0, + "grad_norm": 2.447431264885515, + "language_loss": 0.74588716, + "learning_rate": 2.6941156386539013e-06, + "loss": 0.7680006, + "num_input_tokens_seen": 72834125, + "step": 3373, + "time_per_iteration": 2.485348701477051 + }, + { + "auxiliary_loss_clip": 0.01158329, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.05734408, + "balance_loss_mlp": 1.02516556, + "epoch": 0.4056995130162929, + "flos": 19574583972480.0, + "grad_norm": 2.137650033651343, + "language_loss": 0.80775881, + "learning_rate": 2.6933850323453203e-06, + "loss": 0.82967764, + "num_input_tokens_seen": 72852570, + "step": 3374, + "time_per_iteration": 2.4789068698883057 + }, + { + "auxiliary_loss_clip": 0.01187417, + "auxiliary_loss_mlp": 0.01027991, + "balance_loss_clip": 1.05851412, + "balance_loss_mlp": 1.01972985, + "epoch": 0.405819755906932, + "flos": 15413794191360.0, + "grad_norm": 2.007796301734701, + "language_loss": 0.74573904, + "learning_rate": 2.6926543208507806e-06, + "loss": 0.76789308, + "num_input_tokens_seen": 72871250, + "step": 3375, + "time_per_iteration": 3.2331912517547607 + }, + { + "auxiliary_loss_clip": 0.01172506, + "auxiliary_loss_mlp": 0.01027991, + "balance_loss_clip": 1.05546772, + "balance_loss_mlp": 1.01922894, + "epoch": 0.4059399987975711, + "flos": 21433930565760.0, + "grad_norm": 1.9151486571471033, + "language_loss": 0.8007406, + "learning_rate": 2.6919235042811316e-06, + "loss": 0.82274562, + "num_input_tokens_seen": 72890035, + "step": 3376, + "time_per_iteration": 2.4446563720703125 + }, + { + "auxiliary_loss_clip": 0.01143438, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.05294466, + "balance_loss_mlp": 1.02448356, + "epoch": 0.4060602416882102, + "flos": 25556942217600.0, + "grad_norm": 3.089289128033694, + "language_loss": 0.76336491, + "learning_rate": 2.691192582747237e-06, + "loss": 0.78513372, + "num_input_tokens_seen": 72909665, + "step": 3377, + "time_per_iteration": 2.5499448776245117 + }, + { + "auxiliary_loss_clip": 0.01191227, + "auxiliary_loss_mlp": 0.01025928, + "balance_loss_clip": 1.05952299, + "balance_loss_mlp": 1.01813221, + "epoch": 0.40618048457884925, + "flos": 23766759262080.0, + "grad_norm": 1.7072330383906646, + "language_loss": 0.73954666, + "learning_rate": 2.6904615563599765e-06, + "loss": 0.76171815, + "num_input_tokens_seen": 72929465, + "step": 3378, + "time_per_iteration": 2.4370408058166504 + }, + { + "auxiliary_loss_clip": 0.01138206, + "auxiliary_loss_mlp": 0.01025638, + "balance_loss_clip": 1.0499233, + "balance_loss_mlp": 1.0176599, + "epoch": 0.40630072746948837, + "flos": 17639681120640.0, + "grad_norm": 2.5686773493979973, + "language_loss": 0.83398104, + "learning_rate": 2.6897304252302477e-06, + "loss": 0.85561949, + "num_input_tokens_seen": 72946785, + "step": 3379, + "time_per_iteration": 3.4193224906921387 + }, + { + "auxiliary_loss_clip": 0.01047831, + "auxiliary_loss_mlp": 0.01000571, + "balance_loss_clip": 1.01428914, + "balance_loss_mlp": 0.99953955, + "epoch": 0.4064209703601275, + "flos": 60836053063680.0, + "grad_norm": 0.7849656510100202, + "language_loss": 0.54760706, + "learning_rate": 2.688999189468962e-06, + "loss": 0.56809115, + "num_input_tokens_seen": 73003215, + "step": 3380, + "time_per_iteration": 2.9865474700927734 + }, + { + "auxiliary_loss_clip": 0.01172775, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.0563426, + "balance_loss_mlp": 1.02579331, + "epoch": 0.40654121325076653, + "flos": 24024346669440.0, + "grad_norm": 2.5553775502011926, + "language_loss": 0.76012647, + "learning_rate": 2.6882678491870464e-06, + "loss": 0.78218997, + "num_input_tokens_seen": 73023650, + "step": 3381, + "time_per_iteration": 3.308553695678711 + }, + { + "auxiliary_loss_clip": 0.01176911, + "auxiliary_loss_mlp": 0.01024662, + "balance_loss_clip": 1.05644083, + "balance_loss_mlp": 1.01563787, + "epoch": 0.40666145614140564, + "flos": 27344252085120.0, + "grad_norm": 1.6598282134544586, + "language_loss": 0.71465707, + "learning_rate": 2.6875364044954453e-06, + "loss": 0.73667282, + "num_input_tokens_seen": 73043880, + "step": 3382, + "time_per_iteration": 2.5174951553344727 + }, + { + "auxiliary_loss_clip": 0.01153549, + "auxiliary_loss_mlp": 0.01027292, + "balance_loss_clip": 1.0483793, + "balance_loss_mlp": 1.01939476, + "epoch": 0.40678169903204475, + "flos": 26176724415360.0, + "grad_norm": 1.5373131935642868, + "language_loss": 0.82119489, + "learning_rate": 2.6868048555051185e-06, + "loss": 0.84300327, + "num_input_tokens_seen": 73065410, + "step": 3383, + "time_per_iteration": 2.553271770477295 + }, + { + "auxiliary_loss_clip": 0.01164721, + "auxiliary_loss_mlp": 0.01032917, + "balance_loss_clip": 1.05202174, + "balance_loss_mlp": 1.02447939, + "epoch": 0.4069019419226838, + "flos": 28622420622720.0, + "grad_norm": 2.7480694585289087, + "language_loss": 0.85636979, + "learning_rate": 2.686073202327041e-06, + "loss": 0.87834615, + "num_input_tokens_seen": 73084410, + "step": 3384, + "time_per_iteration": 3.3438169956207275 + }, + { + "auxiliary_loss_clip": 0.01147696, + "auxiliary_loss_mlp": 0.0103518, + "balance_loss_clip": 1.04973865, + "balance_loss_mlp": 1.02645969, + "epoch": 0.4070221848133229, + "flos": 25229006023680.0, + "grad_norm": 1.671546420932362, + "language_loss": 0.73064798, + "learning_rate": 2.6853414450722043e-06, + "loss": 0.75247675, + "num_input_tokens_seen": 73104075, + "step": 3385, + "time_per_iteration": 2.544618844985962 + }, + { + "auxiliary_loss_clip": 0.01171394, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.05489922, + "balance_loss_mlp": 1.01890278, + "epoch": 0.40714242770396203, + "flos": 18405224709120.0, + "grad_norm": 1.7653393830609239, + "language_loss": 0.85523063, + "learning_rate": 2.684609583851616e-06, + "loss": 0.87721825, + "num_input_tokens_seen": 73122250, + "step": 3386, + "time_per_iteration": 2.448624849319458 + }, + { + "auxiliary_loss_clip": 0.01130067, + "auxiliary_loss_mlp": 0.01030279, + "balance_loss_clip": 1.05005169, + "balance_loss_mlp": 1.02222037, + "epoch": 0.4072626705946011, + "flos": 30228920403840.0, + "grad_norm": 1.5110664228785189, + "language_loss": 0.80705535, + "learning_rate": 2.683877618776297e-06, + "loss": 0.82865882, + "num_input_tokens_seen": 73144505, + "step": 3387, + "time_per_iteration": 2.6386239528656006 + }, + { + "auxiliary_loss_clip": 0.01151296, + "auxiliary_loss_mlp": 0.01033124, + "balance_loss_clip": 1.04902768, + "balance_loss_mlp": 1.02413559, + "epoch": 0.4073829134852402, + "flos": 21834549930240.0, + "grad_norm": 2.329699160904908, + "language_loss": 0.73962247, + "learning_rate": 2.6831455499572876e-06, + "loss": 0.76146668, + "num_input_tokens_seen": 73162440, + "step": 3388, + "time_per_iteration": 2.504287004470825 + }, + { + "auxiliary_loss_clip": 0.01189, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.05571914, + "balance_loss_mlp": 1.02000022, + "epoch": 0.40750315637587925, + "flos": 25260211964160.0, + "grad_norm": 1.7834216739595714, + "language_loss": 0.77691239, + "learning_rate": 2.682413377505641e-06, + "loss": 0.79908836, + "num_input_tokens_seen": 73181245, + "step": 3389, + "time_per_iteration": 2.4684269428253174 + }, + { + "auxiliary_loss_clip": 0.01173233, + "auxiliary_loss_mlp": 0.01028147, + "balance_loss_clip": 1.0538801, + "balance_loss_mlp": 1.01969528, + "epoch": 0.40762339926651836, + "flos": 19712767593600.0, + "grad_norm": 2.2829060372494556, + "language_loss": 0.7656498, + "learning_rate": 2.6816811015324284e-06, + "loss": 0.78766358, + "num_input_tokens_seen": 73199295, + "step": 3390, + "time_per_iteration": 2.4656755924224854 + }, + { + "auxiliary_loss_clip": 0.01080487, + "auxiliary_loss_mlp": 0.01000646, + "balance_loss_clip": 1.0143044, + "balance_loss_mlp": 0.99968034, + "epoch": 0.40774364215715747, + "flos": 71449307314560.0, + "grad_norm": 0.7269881696067815, + "language_loss": 0.56734681, + "learning_rate": 2.6809487221487343e-06, + "loss": 0.58815813, + "num_input_tokens_seen": 73258780, + "step": 3391, + "time_per_iteration": 2.9244205951690674 + }, + { + "auxiliary_loss_clip": 0.01164119, + "auxiliary_loss_mlp": 0.01022943, + "balance_loss_clip": 1.05148816, + "balance_loss_mlp": 1.01450348, + "epoch": 0.4078638850477965, + "flos": 15084134144640.0, + "grad_norm": 2.7218408774333094, + "language_loss": 0.81857866, + "learning_rate": 2.6802162394656605e-06, + "loss": 0.84044933, + "num_input_tokens_seen": 73275490, + "step": 3392, + "time_per_iteration": 2.4352774620056152 + }, + { + "auxiliary_loss_clip": 0.01153499, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.04879951, + "balance_loss_mlp": 1.0229224, + "epoch": 0.40798412793843564, + "flos": 23842890138240.0, + "grad_norm": 1.6492507823067226, + "language_loss": 0.71605313, + "learning_rate": 2.679483653594324e-06, + "loss": 0.73789752, + "num_input_tokens_seen": 73297260, + "step": 3393, + "time_per_iteration": 2.553133010864258 + }, + { + "auxiliary_loss_clip": 0.01176084, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.05544233, + "balance_loss_mlp": 1.02255297, + "epoch": 0.40810437082907475, + "flos": 21065774117760.0, + "grad_norm": 4.107292579256087, + "language_loss": 0.75852883, + "learning_rate": 2.678750964645857e-06, + "loss": 0.78059822, + "num_input_tokens_seen": 73316340, + "step": 3394, + "time_per_iteration": 2.469615936279297 + }, + { + "auxiliary_loss_clip": 0.0118087, + "auxiliary_loss_mlp": 0.01033674, + "balance_loss_clip": 1.06152833, + "balance_loss_mlp": 1.02521014, + "epoch": 0.4082246137197138, + "flos": 11321377948800.0, + "grad_norm": 2.4602551635508556, + "language_loss": 0.8330512, + "learning_rate": 2.6780181727314094e-06, + "loss": 0.8551966, + "num_input_tokens_seen": 73331245, + "step": 3395, + "time_per_iteration": 2.4695208072662354 + }, + { + "auxiliary_loss_clip": 0.01146659, + "auxiliary_loss_mlp": 0.00762777, + "balance_loss_clip": 1.05050862, + "balance_loss_mlp": 1.00010419, + "epoch": 0.4083448566103529, + "flos": 19062569554560.0, + "grad_norm": 1.888437769656481, + "language_loss": 0.78091037, + "learning_rate": 2.6772852779621435e-06, + "loss": 0.80000478, + "num_input_tokens_seen": 73349105, + "step": 3396, + "time_per_iteration": 2.50990891456604 + }, + { + "auxiliary_loss_clip": 0.01170266, + "auxiliary_loss_mlp": 0.00762535, + "balance_loss_clip": 1.05872989, + "balance_loss_mlp": 1.00008106, + "epoch": 0.408465099500992, + "flos": 23550254035200.0, + "grad_norm": 2.3388074612949405, + "language_loss": 0.86841989, + "learning_rate": 2.676552280449239e-06, + "loss": 0.88774788, + "num_input_tokens_seen": 73368990, + "step": 3397, + "time_per_iteration": 2.494401454925537 + }, + { + "auxiliary_loss_clip": 0.01163707, + "auxiliary_loss_mlp": 0.0103192, + "balance_loss_clip": 1.05220544, + "balance_loss_mlp": 1.02306247, + "epoch": 0.4085853423916311, + "flos": 12750012558720.0, + "grad_norm": 2.2422508009404467, + "language_loss": 0.76202452, + "learning_rate": 2.6758191803038917e-06, + "loss": 0.78398085, + "num_input_tokens_seen": 73387485, + "step": 3398, + "time_per_iteration": 2.436962366104126 + }, + { + "auxiliary_loss_clip": 0.01107744, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.04918516, + "balance_loss_mlp": 1.02071166, + "epoch": 0.4087055852822702, + "flos": 24353072962560.0, + "grad_norm": 1.6875299277262752, + "language_loss": 0.82741207, + "learning_rate": 2.6750859776373125e-06, + "loss": 0.84878027, + "num_input_tokens_seen": 73406940, + "step": 3399, + "time_per_iteration": 2.617870569229126 + }, + { + "auxiliary_loss_clip": 0.01029972, + "auxiliary_loss_mlp": 0.01006581, + "balance_loss_clip": 1.01360369, + "balance_loss_mlp": 1.00530553, + "epoch": 0.4088258281729093, + "flos": 66387950720640.0, + "grad_norm": 0.7750689225180801, + "language_loss": 0.604289, + "learning_rate": 2.674352672560727e-06, + "loss": 0.62465453, + "num_input_tokens_seen": 73468385, + "step": 3400, + "time_per_iteration": 3.161284923553467 + }, + { + "auxiliary_loss_clip": 0.011438, + "auxiliary_loss_mlp": 0.01026294, + "balance_loss_clip": 1.05002904, + "balance_loss_mlp": 1.01752019, + "epoch": 0.40894607106354836, + "flos": 20449260057600.0, + "grad_norm": 1.896731507224513, + "language_loss": 0.76985514, + "learning_rate": 2.673619265185377e-06, + "loss": 0.79155612, + "num_input_tokens_seen": 73488225, + "step": 3401, + "time_per_iteration": 2.543379306793213 + }, + { + "auxiliary_loss_clip": 0.01175115, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.05433238, + "balance_loss_mlp": 1.02643752, + "epoch": 0.40906631395418747, + "flos": 27053627143680.0, + "grad_norm": 1.6220312349977828, + "language_loss": 0.78030384, + "learning_rate": 2.672885755622521e-06, + "loss": 0.80240524, + "num_input_tokens_seen": 73510640, + "step": 3402, + "time_per_iteration": 3.3825464248657227 + }, + { + "auxiliary_loss_clip": 0.01128153, + "auxiliary_loss_mlp": 0.01029671, + "balance_loss_clip": 1.04652154, + "balance_loss_mlp": 1.02126062, + "epoch": 0.4091865568448266, + "flos": 25484151306240.0, + "grad_norm": 2.122783565191288, + "language_loss": 0.70295417, + "learning_rate": 2.67215214398343e-06, + "loss": 0.72453237, + "num_input_tokens_seen": 73530655, + "step": 3403, + "time_per_iteration": 2.606388807296753 + }, + { + "auxiliary_loss_clip": 0.01131998, + "auxiliary_loss_mlp": 0.01031105, + "balance_loss_clip": 1.04633403, + "balance_loss_mlp": 1.02207541, + "epoch": 0.40930679973546563, + "flos": 28657864368000.0, + "grad_norm": 2.4252370637363017, + "language_loss": 0.78026772, + "learning_rate": 2.671418430379393e-06, + "loss": 0.80189872, + "num_input_tokens_seen": 73549340, + "step": 3404, + "time_per_iteration": 2.6188619136810303 + }, + { + "auxiliary_loss_clip": 0.01186454, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.05508924, + "balance_loss_mlp": 1.01846433, + "epoch": 0.40942704262610474, + "flos": 20886292834560.0, + "grad_norm": 3.0820917807794084, + "language_loss": 0.82934201, + "learning_rate": 2.670684614921715e-06, + "loss": 0.85147572, + "num_input_tokens_seen": 73568315, + "step": 3405, + "time_per_iteration": 2.432753324508667 + }, + { + "auxiliary_loss_clip": 0.01159272, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.0510478, + "balance_loss_mlp": 1.02314281, + "epoch": 0.4095472855167438, + "flos": 21618080616960.0, + "grad_norm": 3.0741149815124618, + "language_loss": 0.69075906, + "learning_rate": 2.6699506977217128e-06, + "loss": 0.71266997, + "num_input_tokens_seen": 73588490, + "step": 3406, + "time_per_iteration": 3.3827433586120605 + }, + { + "auxiliary_loss_clip": 0.01171514, + "auxiliary_loss_mlp": 0.01026027, + "balance_loss_clip": 1.05772305, + "balance_loss_mlp": 1.01787865, + "epoch": 0.4096675284073829, + "flos": 27926112499200.0, + "grad_norm": 2.0136362758093114, + "language_loss": 0.69915766, + "learning_rate": 2.6692166788907233e-06, + "loss": 0.72113311, + "num_input_tokens_seen": 73608685, + "step": 3407, + "time_per_iteration": 2.5616419315338135 + }, + { + "auxiliary_loss_clip": 0.01159127, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.05208826, + "balance_loss_mlp": 1.022735, + "epoch": 0.409787771298022, + "flos": 19206607092480.0, + "grad_norm": 1.8887130947309867, + "language_loss": 0.76980293, + "learning_rate": 2.6684825585400957e-06, + "loss": 0.79171073, + "num_input_tokens_seen": 73627630, + "step": 3408, + "time_per_iteration": 3.2694413661956787 + }, + { + "auxiliary_loss_clip": 0.01053915, + "auxiliary_loss_mlp": 0.01001098, + "balance_loss_clip": 1.01382983, + "balance_loss_mlp": 0.99999523, + "epoch": 0.4099080141886611, + "flos": 59269234832640.0, + "grad_norm": 0.8230670138384596, + "language_loss": 0.65103191, + "learning_rate": 2.6677483367811947e-06, + "loss": 0.67158204, + "num_input_tokens_seen": 73687670, + "step": 3409, + "time_per_iteration": 3.1678109169006348 + }, + { + "auxiliary_loss_clip": 0.01173871, + "auxiliary_loss_mlp": 0.01025946, + "balance_loss_clip": 1.05198944, + "balance_loss_mlp": 1.0180124, + "epoch": 0.4100282570793002, + "flos": 21906443001600.0, + "grad_norm": 1.6943569846300655, + "language_loss": 0.75456792, + "learning_rate": 2.6670140137254028e-06, + "loss": 0.77656609, + "num_input_tokens_seen": 73707145, + "step": 3410, + "time_per_iteration": 3.225379705429077 + }, + { + "auxiliary_loss_clip": 0.01125115, + "auxiliary_loss_mlp": 0.01027262, + "balance_loss_clip": 1.04895771, + "balance_loss_mlp": 1.01903653, + "epoch": 0.4101484999699393, + "flos": 18551596631040.0, + "grad_norm": 2.175898670822533, + "language_loss": 0.89666653, + "learning_rate": 2.666279589484115e-06, + "loss": 0.91819024, + "num_input_tokens_seen": 73725045, + "step": 3411, + "time_per_iteration": 2.541153907775879 + }, + { + "auxiliary_loss_clip": 0.01127034, + "auxiliary_loss_mlp": 0.01024678, + "balance_loss_clip": 1.04674268, + "balance_loss_mlp": 1.01692891, + "epoch": 0.41026874286057835, + "flos": 19094529680640.0, + "grad_norm": 1.9883902105830888, + "language_loss": 0.80995059, + "learning_rate": 2.6655450641687435e-06, + "loss": 0.83146769, + "num_input_tokens_seen": 73742610, + "step": 3412, + "time_per_iteration": 2.542750597000122 + }, + { + "auxiliary_loss_clip": 0.01187967, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.05880988, + "balance_loss_mlp": 1.02080488, + "epoch": 0.41038898575121746, + "flos": 31209568588800.0, + "grad_norm": 1.64470787294491, + "language_loss": 0.69458604, + "learning_rate": 2.664810437890715e-06, + "loss": 0.71675819, + "num_input_tokens_seen": 73764280, + "step": 3413, + "time_per_iteration": 2.5264055728912354 + }, + { + "auxiliary_loss_clip": 0.01109018, + "auxiliary_loss_mlp": 0.01028718, + "balance_loss_clip": 1.0532831, + "balance_loss_mlp": 1.02134216, + "epoch": 0.41050922864185657, + "flos": 14355865895040.0, + "grad_norm": 4.289189759480733, + "language_loss": 0.79475296, + "learning_rate": 2.6640757107614714e-06, + "loss": 0.81613034, + "num_input_tokens_seen": 73782375, + "step": 3414, + "time_per_iteration": 2.571340799331665 + }, + { + "auxiliary_loss_clip": 0.0113986, + "auxiliary_loss_mlp": 0.01026666, + "balance_loss_clip": 1.05248928, + "balance_loss_mlp": 1.01807749, + "epoch": 0.4106294715324956, + "flos": 30956290813440.0, + "grad_norm": 2.09267630270332, + "language_loss": 0.68926632, + "learning_rate": 2.6633408828924697e-06, + "loss": 0.7109316, + "num_input_tokens_seen": 73801240, + "step": 3415, + "time_per_iteration": 2.6018478870391846 + }, + { + "auxiliary_loss_clip": 0.01153017, + "auxiliary_loss_mlp": 0.01036531, + "balance_loss_clip": 1.05497253, + "balance_loss_mlp": 1.02856743, + "epoch": 0.41074971442313474, + "flos": 24457321209600.0, + "grad_norm": 1.5546567530053277, + "language_loss": 0.7016114, + "learning_rate": 2.662605954395185e-06, + "loss": 0.72350681, + "num_input_tokens_seen": 73821200, + "step": 3416, + "time_per_iteration": 2.5898351669311523 + }, + { + "auxiliary_loss_clip": 0.01173896, + "auxiliary_loss_mlp": 0.01025115, + "balance_loss_clip": 1.0532558, + "balance_loss_mlp": 1.01743186, + "epoch": 0.41086995731377385, + "flos": 21542991235200.0, + "grad_norm": 1.7748157847477455, + "language_loss": 0.83809614, + "learning_rate": 2.6618709253811027e-06, + "loss": 0.86008626, + "num_input_tokens_seen": 73840655, + "step": 3417, + "time_per_iteration": 2.5107195377349854 + }, + { + "auxiliary_loss_clip": 0.01185247, + "auxiliary_loss_mlp": 0.01026453, + "balance_loss_clip": 1.05799937, + "balance_loss_mlp": 1.01951528, + "epoch": 0.4109902002044129, + "flos": 20702753314560.0, + "grad_norm": 1.6345945953980663, + "language_loss": 0.87304175, + "learning_rate": 2.6611357959617277e-06, + "loss": 0.89515877, + "num_input_tokens_seen": 73860275, + "step": 3418, + "time_per_iteration": 2.4438281059265137 + }, + { + "auxiliary_loss_clip": 0.01138697, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.04998684, + "balance_loss_mlp": 1.02713096, + "epoch": 0.411110443095052, + "flos": 18179992477440.0, + "grad_norm": 1.7857831501145531, + "language_loss": 0.90923345, + "learning_rate": 2.660400566248578e-06, + "loss": 0.93097949, + "num_input_tokens_seen": 73878400, + "step": 3419, + "time_per_iteration": 2.5350852012634277 + }, + { + "auxiliary_loss_clip": 0.01144094, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.04989147, + "balance_loss_mlp": 1.0232445, + "epoch": 0.41123068598569107, + "flos": 14575244209920.0, + "grad_norm": 2.2793699006355044, + "language_loss": 0.66624498, + "learning_rate": 2.6596652363531876e-06, + "loss": 0.68801212, + "num_input_tokens_seen": 73894275, + "step": 3420, + "time_per_iteration": 2.5832343101501465 + }, + { + "auxiliary_loss_clip": 0.01184643, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.05510128, + "balance_loss_mlp": 1.01871634, + "epoch": 0.4113509288763302, + "flos": 21177995184000.0, + "grad_norm": 1.4748049450188518, + "language_loss": 0.78456891, + "learning_rate": 2.6589298063871055e-06, + "loss": 0.80668163, + "num_input_tokens_seen": 73914450, + "step": 3421, + "time_per_iteration": 2.444382429122925 + }, + { + "auxiliary_loss_clip": 0.0118418, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.05450559, + "balance_loss_mlp": 1.01916766, + "epoch": 0.4114711717669693, + "flos": 18442212739200.0, + "grad_norm": 1.898926543113717, + "language_loss": 0.69418907, + "learning_rate": 2.658194276461895e-06, + "loss": 0.71630895, + "num_input_tokens_seen": 73932375, + "step": 3422, + "time_per_iteration": 2.4294989109039307 + }, + { + "auxiliary_loss_clip": 0.01159555, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.05047846, + "balance_loss_mlp": 1.01993859, + "epoch": 0.41159141465760835, + "flos": 27233395735680.0, + "grad_norm": 2.1380857066121157, + "language_loss": 0.67110312, + "learning_rate": 2.6574586466891368e-06, + "loss": 0.69298863, + "num_input_tokens_seen": 73952850, + "step": 3423, + "time_per_iteration": 2.564086675643921 + }, + { + "auxiliary_loss_clip": 0.01155646, + "auxiliary_loss_mlp": 0.00762543, + "balance_loss_clip": 1.05102348, + "balance_loss_mlp": 1.00011778, + "epoch": 0.41171165754824746, + "flos": 20006876154240.0, + "grad_norm": 3.1246470547510246, + "language_loss": 0.64620644, + "learning_rate": 2.6567229171804247e-06, + "loss": 0.66538835, + "num_input_tokens_seen": 73970735, + "step": 3424, + "time_per_iteration": 2.495319128036499 + }, + { + "auxiliary_loss_clip": 0.01149574, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.0475421, + "balance_loss_mlp": 1.02885866, + "epoch": 0.41183190043888657, + "flos": 18004318035840.0, + "grad_norm": 2.5008071495225725, + "language_loss": 0.87688345, + "learning_rate": 2.655987088047368e-06, + "loss": 0.89876032, + "num_input_tokens_seen": 73989080, + "step": 3425, + "time_per_iteration": 2.4999303817749023 + }, + { + "auxiliary_loss_clip": 0.01150703, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.04946041, + "balance_loss_mlp": 1.02320433, + "epoch": 0.4119521433295256, + "flos": 27163370171520.0, + "grad_norm": 2.463150626199949, + "language_loss": 0.78562689, + "learning_rate": 2.6552511594015912e-06, + "loss": 0.80745339, + "num_input_tokens_seen": 74009470, + "step": 3426, + "time_per_iteration": 2.5535061359405518 + }, + { + "auxiliary_loss_clip": 0.01155337, + "auxiliary_loss_mlp": 0.01028563, + "balance_loss_clip": 1.04829454, + "balance_loss_mlp": 1.01948524, + "epoch": 0.41207238622016473, + "flos": 15122020014720.0, + "grad_norm": 1.915374764288529, + "language_loss": 0.85259998, + "learning_rate": 2.654515131354735e-06, + "loss": 0.874439, + "num_input_tokens_seen": 74027735, + "step": 3427, + "time_per_iteration": 2.483039379119873 + }, + { + "auxiliary_loss_clip": 0.01143483, + "auxiliary_loss_mlp": 0.010282, + "balance_loss_clip": 1.05152774, + "balance_loss_mlp": 1.02076781, + "epoch": 0.41219262911080384, + "flos": 27052872958080.0, + "grad_norm": 2.4105201343755893, + "language_loss": 0.84782207, + "learning_rate": 2.653779004018453e-06, + "loss": 0.8695389, + "num_input_tokens_seen": 74048300, + "step": 3428, + "time_per_iteration": 3.4264934062957764 + }, + { + "auxiliary_loss_clip": 0.01149484, + "auxiliary_loss_mlp": 0.01022603, + "balance_loss_clip": 1.05072546, + "balance_loss_mlp": 1.01456213, + "epoch": 0.4123128720014429, + "flos": 24686360282880.0, + "grad_norm": 1.8627355113254442, + "language_loss": 0.82521957, + "learning_rate": 2.653042777504417e-06, + "loss": 0.8469404, + "num_input_tokens_seen": 74070890, + "step": 3429, + "time_per_iteration": 2.5591697692871094 + }, + { + "auxiliary_loss_clip": 0.01165088, + "auxiliary_loss_mlp": 0.01025107, + "balance_loss_clip": 1.05193996, + "balance_loss_mlp": 1.01668453, + "epoch": 0.412433114892082, + "flos": 26244774731520.0, + "grad_norm": 1.7746401224407709, + "language_loss": 0.79560387, + "learning_rate": 2.6523064519243105e-06, + "loss": 0.81750584, + "num_input_tokens_seen": 74090460, + "step": 3430, + "time_per_iteration": 2.5351781845092773 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01033445, + "balance_loss_clip": 1.05414796, + "balance_loss_mlp": 1.024647, + "epoch": 0.4125533577827211, + "flos": 21361031913600.0, + "grad_norm": 2.4688279827538633, + "language_loss": 0.79005921, + "learning_rate": 2.6515700273898333e-06, + "loss": 0.81211209, + "num_input_tokens_seen": 74108335, + "step": 3431, + "time_per_iteration": 2.4668734073638916 + }, + { + "auxiliary_loss_clip": 0.01146948, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.05278516, + "balance_loss_mlp": 1.02311659, + "epoch": 0.4126736006733602, + "flos": 26067556005120.0, + "grad_norm": 1.8607911076876527, + "language_loss": 0.68730193, + "learning_rate": 2.6508335040127018e-06, + "loss": 0.70909238, + "num_input_tokens_seen": 74128030, + "step": 3432, + "time_per_iteration": 3.360414981842041 + }, + { + "auxiliary_loss_clip": 0.01175638, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.05609465, + "balance_loss_mlp": 1.02158082, + "epoch": 0.4127938435639993, + "flos": 25666146541440.0, + "grad_norm": 1.9936524485733829, + "language_loss": 0.77026403, + "learning_rate": 2.6500968819046446e-06, + "loss": 0.79231644, + "num_input_tokens_seen": 74148330, + "step": 3433, + "time_per_iteration": 2.518282413482666 + }, + { + "auxiliary_loss_clip": 0.01128342, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.04418397, + "balance_loss_mlp": 1.02111101, + "epoch": 0.4129140864546384, + "flos": 17995914253440.0, + "grad_norm": 2.3148799500342108, + "language_loss": 0.58738357, + "learning_rate": 2.649360161177408e-06, + "loss": 0.60896146, + "num_input_tokens_seen": 74163390, + "step": 3434, + "time_per_iteration": 2.4858314990997314 + }, + { + "auxiliary_loss_clip": 0.01178267, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.05378246, + "balance_loss_mlp": 1.02030063, + "epoch": 0.41303432934527745, + "flos": 23732895715200.0, + "grad_norm": 2.182380081991096, + "language_loss": 0.73062766, + "learning_rate": 2.6486233419427504e-06, + "loss": 0.7526952, + "num_input_tokens_seen": 74183205, + "step": 3435, + "time_per_iteration": 3.205916166305542 + }, + { + "auxiliary_loss_clip": 0.01133665, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.04971623, + "balance_loss_mlp": 1.0169239, + "epoch": 0.41315457223591656, + "flos": 19755286318080.0, + "grad_norm": 2.440945576826226, + "language_loss": 0.75222224, + "learning_rate": 2.6478864243124484e-06, + "loss": 0.77381611, + "num_input_tokens_seen": 74202870, + "step": 3436, + "time_per_iteration": 2.5269200801849365 + }, + { + "auxiliary_loss_clip": 0.01170382, + "auxiliary_loss_mlp": 0.01022709, + "balance_loss_clip": 1.05038667, + "balance_loss_mlp": 1.01523471, + "epoch": 0.4132748151265556, + "flos": 20923316778240.0, + "grad_norm": 3.817682558471954, + "language_loss": 0.8523711, + "learning_rate": 2.6471494083982903e-06, + "loss": 0.87430197, + "num_input_tokens_seen": 74222255, + "step": 3437, + "time_per_iteration": 3.264099597930908 + }, + { + "auxiliary_loss_clip": 0.01144507, + "auxiliary_loss_mlp": 0.0102554, + "balance_loss_clip": 1.0489893, + "balance_loss_mlp": 1.01801169, + "epoch": 0.4133950580171947, + "flos": 32232520016640.0, + "grad_norm": 1.6900278272702725, + "language_loss": 0.74777019, + "learning_rate": 2.6464122943120818e-06, + "loss": 0.76947063, + "num_input_tokens_seen": 74242480, + "step": 3438, + "time_per_iteration": 2.6100597381591797 + }, + { + "auxiliary_loss_clip": 0.01142309, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.05198514, + "balance_loss_mlp": 1.01710606, + "epoch": 0.41351530090783384, + "flos": 23292487059840.0, + "grad_norm": 2.8451429782575164, + "language_loss": 0.82532728, + "learning_rate": 2.645675082165642e-06, + "loss": 0.8470062, + "num_input_tokens_seen": 74258690, + "step": 3439, + "time_per_iteration": 2.526207447052002 + }, + { + "auxiliary_loss_clip": 0.0115597, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.05181456, + "balance_loss_mlp": 1.0220449, + "epoch": 0.4136355437984729, + "flos": 25593571111680.0, + "grad_norm": 1.9902155610622383, + "language_loss": 0.75291467, + "learning_rate": 2.644937772070806e-06, + "loss": 0.77478182, + "num_input_tokens_seen": 74277135, + "step": 3440, + "time_per_iteration": 2.5322275161743164 + }, + { + "auxiliary_loss_clip": 0.01185336, + "auxiliary_loss_mlp": 0.0102605, + "balance_loss_clip": 1.05458558, + "balance_loss_mlp": 1.01757407, + "epoch": 0.413755786689112, + "flos": 19828615933440.0, + "grad_norm": 2.225910037209359, + "language_loss": 0.83527458, + "learning_rate": 2.6442003641394225e-06, + "loss": 0.8573885, + "num_input_tokens_seen": 74294730, + "step": 3441, + "time_per_iteration": 2.4213078022003174 + }, + { + "auxiliary_loss_clip": 0.01153504, + "auxiliary_loss_mlp": 0.01027351, + "balance_loss_clip": 1.04937196, + "balance_loss_mlp": 1.01957822, + "epoch": 0.4138760295797511, + "flos": 26870446759680.0, + "grad_norm": 1.564187576833067, + "language_loss": 0.83831024, + "learning_rate": 2.643462858483356e-06, + "loss": 0.86011875, + "num_input_tokens_seen": 74315015, + "step": 3442, + "time_per_iteration": 2.5670652389526367 + }, + { + "auxiliary_loss_clip": 0.01125422, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.04939699, + "balance_loss_mlp": 1.02306676, + "epoch": 0.41399627247039017, + "flos": 16399254798720.0, + "grad_norm": 1.8494849239088171, + "language_loss": 0.72705418, + "learning_rate": 2.6427252552144856e-06, + "loss": 0.74862754, + "num_input_tokens_seen": 74333665, + "step": 3443, + "time_per_iteration": 2.5562806129455566 + }, + { + "auxiliary_loss_clip": 0.0118492, + "auxiliary_loss_mlp": 0.01033638, + "balance_loss_clip": 1.05346036, + "balance_loss_mlp": 1.02501965, + "epoch": 0.4141165153610293, + "flos": 22930220442240.0, + "grad_norm": 2.140378889839288, + "language_loss": 0.75185335, + "learning_rate": 2.6419875544447044e-06, + "loss": 0.77403891, + "num_input_tokens_seen": 74355065, + "step": 3444, + "time_per_iteration": 2.5564467906951904 + }, + { + "auxiliary_loss_clip": 0.01185332, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.05342913, + "balance_loss_mlp": 1.02368903, + "epoch": 0.4142367582516684, + "flos": 25192556697600.0, + "grad_norm": 1.764905075102477, + "language_loss": 0.71788776, + "learning_rate": 2.6412497562859218e-06, + "loss": 0.74006623, + "num_input_tokens_seen": 74376345, + "step": 3445, + "time_per_iteration": 2.4802005290985107 + }, + { + "auxiliary_loss_clip": 0.01173914, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.05203605, + "balance_loss_mlp": 1.02009642, + "epoch": 0.41435700114230745, + "flos": 21690476478720.0, + "grad_norm": 2.1571402197682286, + "language_loss": 0.76271755, + "learning_rate": 2.6405118608500617e-06, + "loss": 0.78474349, + "num_input_tokens_seen": 74395170, + "step": 3446, + "time_per_iteration": 2.4645159244537354 + }, + { + "auxiliary_loss_clip": 0.01136026, + "auxiliary_loss_mlp": 0.01026862, + "balance_loss_clip": 1.0526613, + "balance_loss_mlp": 1.01902962, + "epoch": 0.41447724403294656, + "flos": 25995160143360.0, + "grad_norm": 2.178869720562306, + "language_loss": 0.81265402, + "learning_rate": 2.6397738682490613e-06, + "loss": 0.83428293, + "num_input_tokens_seen": 74416070, + "step": 3447, + "time_per_iteration": 2.579881191253662 + }, + { + "auxiliary_loss_clip": 0.01183748, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_clip": 1.05414104, + "balance_loss_mlp": 1.01855087, + "epoch": 0.41459748692358567, + "flos": 18259678800000.0, + "grad_norm": 2.4376251487091425, + "language_loss": 0.75320101, + "learning_rate": 2.6390357785948734e-06, + "loss": 0.77530515, + "num_input_tokens_seen": 74433185, + "step": 3448, + "time_per_iteration": 2.4142868518829346 + }, + { + "auxiliary_loss_clip": 0.01171666, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.0545063, + "balance_loss_mlp": 1.02364588, + "epoch": 0.4147177298142247, + "flos": 24168456034560.0, + "grad_norm": 1.705961680271532, + "language_loss": 0.80198503, + "learning_rate": 2.6382975919994667e-06, + "loss": 0.82402205, + "num_input_tokens_seen": 74453760, + "step": 3449, + "time_per_iteration": 2.4977855682373047 + }, + { + "auxiliary_loss_clip": 0.01157141, + "auxiliary_loss_mlp": 0.01026213, + "balance_loss_clip": 1.05161262, + "balance_loss_mlp": 1.01876807, + "epoch": 0.41483797270486383, + "flos": 20084659056000.0, + "grad_norm": 1.5213965130436213, + "language_loss": 0.72941345, + "learning_rate": 2.637559308574822e-06, + "loss": 0.75124699, + "num_input_tokens_seen": 74473505, + "step": 3450, + "time_per_iteration": 2.498060464859009 + }, + { + "auxiliary_loss_clip": 0.01183914, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.0540688, + "balance_loss_mlp": 1.02298522, + "epoch": 0.4149582155955029, + "flos": 30081040110720.0, + "grad_norm": 2.125749644174213, + "language_loss": 0.70971662, + "learning_rate": 2.6368209284329376e-06, + "loss": 0.73186505, + "num_input_tokens_seen": 74494135, + "step": 3451, + "time_per_iteration": 2.496692657470703 + }, + { + "auxiliary_loss_clip": 0.01167887, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.04943669, + "balance_loss_mlp": 1.02143788, + "epoch": 0.415078458486142, + "flos": 16764394504320.0, + "grad_norm": 1.8856325771597418, + "language_loss": 0.74971163, + "learning_rate": 2.636082451685825e-06, + "loss": 0.77168834, + "num_input_tokens_seen": 74512335, + "step": 3452, + "time_per_iteration": 2.426292657852173 + }, + { + "auxiliary_loss_clip": 0.01159274, + "auxiliary_loss_mlp": 0.01025832, + "balance_loss_clip": 1.05352104, + "balance_loss_mlp": 1.01776731, + "epoch": 0.4151987013767811, + "flos": 26033692458240.0, + "grad_norm": 1.5080057706844285, + "language_loss": 0.86405468, + "learning_rate": 2.6353438784455094e-06, + "loss": 0.88590568, + "num_input_tokens_seen": 74535620, + "step": 3453, + "time_per_iteration": 2.5559446811676025 + }, + { + "auxiliary_loss_clip": 0.01154006, + "auxiliary_loss_mlp": 0.01030048, + "balance_loss_clip": 1.05315661, + "balance_loss_mlp": 1.02099359, + "epoch": 0.41531894426742016, + "flos": 24608002763520.0, + "grad_norm": 2.2212199936848736, + "language_loss": 0.72012186, + "learning_rate": 2.6346052088240326e-06, + "loss": 0.74196237, + "num_input_tokens_seen": 74555140, + "step": 3454, + "time_per_iteration": 2.524017333984375 + }, + { + "auxiliary_loss_clip": 0.01156956, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.04999161, + "balance_loss_mlp": 1.02067614, + "epoch": 0.4154391871580593, + "flos": 14975791747200.0, + "grad_norm": 2.11453114294564, + "language_loss": 0.7770049, + "learning_rate": 2.63386644293345e-06, + "loss": 0.79886699, + "num_input_tokens_seen": 74571485, + "step": 3455, + "time_per_iteration": 3.2934179306030273 + }, + { + "auxiliary_loss_clip": 0.0113483, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.04548907, + "balance_loss_mlp": 1.02170956, + "epoch": 0.4155594300486984, + "flos": 14647173194880.0, + "grad_norm": 2.52187612389324, + "language_loss": 0.82948858, + "learning_rate": 2.633127580885833e-06, + "loss": 0.85113031, + "num_input_tokens_seen": 74585985, + "step": 3456, + "time_per_iteration": 2.497652769088745 + }, + { + "auxiliary_loss_clip": 0.01183387, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.05622053, + "balance_loss_mlp": 1.02385318, + "epoch": 0.41567967293933744, + "flos": 29497276275840.0, + "grad_norm": 2.0215106515898817, + "language_loss": 0.64998156, + "learning_rate": 2.632388622793265e-06, + "loss": 0.67213559, + "num_input_tokens_seen": 74605140, + "step": 3457, + "time_per_iteration": 2.494825839996338 + }, + { + "auxiliary_loss_clip": 0.01170144, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.05432391, + "balance_loss_mlp": 1.02155519, + "epoch": 0.41579991582997655, + "flos": 19238387650560.0, + "grad_norm": 1.7349252082218465, + "language_loss": 0.67804873, + "learning_rate": 2.6316495687678457e-06, + "loss": 0.70004714, + "num_input_tokens_seen": 74623790, + "step": 3458, + "time_per_iteration": 2.468646287918091 + }, + { + "auxiliary_loss_clip": 0.01120027, + "auxiliary_loss_mlp": 0.01024914, + "balance_loss_clip": 1.04591346, + "balance_loss_mlp": 1.01664686, + "epoch": 0.41592015872061566, + "flos": 24462061804800.0, + "grad_norm": 2.88710394098374, + "language_loss": 0.76728904, + "learning_rate": 2.6309104189216887e-06, + "loss": 0.78873843, + "num_input_tokens_seen": 74641355, + "step": 3459, + "time_per_iteration": 3.4454345703125 + }, + { + "auxiliary_loss_clip": 0.01128556, + "auxiliary_loss_mlp": 0.00763053, + "balance_loss_clip": 1.0470171, + "balance_loss_mlp": 1.00022221, + "epoch": 0.4160404016112547, + "flos": 20775651966720.0, + "grad_norm": 2.385908597627895, + "language_loss": 0.74884462, + "learning_rate": 2.630171173366923e-06, + "loss": 0.76776075, + "num_input_tokens_seen": 74657155, + "step": 3460, + "time_per_iteration": 2.5208818912506104 + }, + { + "auxiliary_loss_clip": 0.01124696, + "auxiliary_loss_mlp": 0.01025763, + "balance_loss_clip": 1.04675901, + "balance_loss_mlp": 1.0173943, + "epoch": 0.41616064450189383, + "flos": 13916462820480.0, + "grad_norm": 2.436091787653902, + "language_loss": 0.74587989, + "learning_rate": 2.629431832215691e-06, + "loss": 0.76738441, + "num_input_tokens_seen": 74671960, + "step": 3461, + "time_per_iteration": 2.5265612602233887 + }, + { + "auxiliary_loss_clip": 0.01148906, + "auxiliary_loss_mlp": 0.01022683, + "balance_loss_clip": 1.0486927, + "balance_loss_mlp": 1.01473808, + "epoch": 0.41628088739253294, + "flos": 20010826650240.0, + "grad_norm": 1.5957769260540788, + "language_loss": 0.87647814, + "learning_rate": 2.628692395580151e-06, + "loss": 0.89819407, + "num_input_tokens_seen": 74692050, + "step": 3462, + "time_per_iteration": 3.3256468772888184 + }, + { + "auxiliary_loss_clip": 0.01095397, + "auxiliary_loss_mlp": 0.01027883, + "balance_loss_clip": 1.04392695, + "balance_loss_mlp": 1.01952004, + "epoch": 0.416401130283172, + "flos": 29168801377920.0, + "grad_norm": 1.9496326518963858, + "language_loss": 0.79516274, + "learning_rate": 2.6279528635724747e-06, + "loss": 0.81639552, + "num_input_tokens_seen": 74712205, + "step": 3463, + "time_per_iteration": 3.423001527786255 + }, + { + "auxiliary_loss_clip": 0.01169522, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.05216312, + "balance_loss_mlp": 1.01903939, + "epoch": 0.4165213731738111, + "flos": 16246813478400.0, + "grad_norm": 3.344907597717156, + "language_loss": 0.78575432, + "learning_rate": 2.627213236304848e-06, + "loss": 0.80772573, + "num_input_tokens_seen": 74729005, + "step": 3464, + "time_per_iteration": 2.4564061164855957 + }, + { + "auxiliary_loss_clip": 0.01173144, + "auxiliary_loss_mlp": 0.01026151, + "balance_loss_clip": 1.05422044, + "balance_loss_mlp": 1.01782405, + "epoch": 0.4166416160644502, + "flos": 33765438787200.0, + "grad_norm": 1.9955075768981707, + "language_loss": 0.70592636, + "learning_rate": 2.626473513889472e-06, + "loss": 0.72791928, + "num_input_tokens_seen": 74751385, + "step": 3465, + "time_per_iteration": 2.591647148132324 + }, + { + "auxiliary_loss_clip": 0.01159776, + "auxiliary_loss_mlp": 0.01033299, + "balance_loss_clip": 1.05077195, + "balance_loss_mlp": 1.02512693, + "epoch": 0.41676185895508927, + "flos": 20917498775040.0, + "grad_norm": 1.806155721163052, + "language_loss": 0.83063149, + "learning_rate": 2.625733696438562e-06, + "loss": 0.85256219, + "num_input_tokens_seen": 74768890, + "step": 3466, + "time_per_iteration": 2.469381332397461 + }, + { + "auxiliary_loss_clip": 0.01152033, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.05059648, + "balance_loss_mlp": 1.0249387, + "epoch": 0.4168821018457284, + "flos": 18406122549120.0, + "grad_norm": 1.6478267641127824, + "language_loss": 0.75351608, + "learning_rate": 2.6249937840643476e-06, + "loss": 0.77536607, + "num_input_tokens_seen": 74787195, + "step": 3467, + "time_per_iteration": 2.4977591037750244 + }, + { + "auxiliary_loss_clip": 0.01183263, + "auxiliary_loss_mlp": 0.0076253, + "balance_loss_clip": 1.05469751, + "balance_loss_mlp": 1.00021815, + "epoch": 0.41700234473636744, + "flos": 18698399516160.0, + "grad_norm": 1.8950878755130762, + "language_loss": 0.66750717, + "learning_rate": 2.6242537768790733e-06, + "loss": 0.68696517, + "num_input_tokens_seen": 74806350, + "step": 3468, + "time_per_iteration": 2.4356980323791504 + }, + { + "auxiliary_loss_clip": 0.01170564, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.05308676, + "balance_loss_mlp": 1.01998937, + "epoch": 0.41712258762700655, + "flos": 31033283616000.0, + "grad_norm": 1.8723668590831297, + "language_loss": 0.68660635, + "learning_rate": 2.6235136749949975e-06, + "loss": 0.70859694, + "num_input_tokens_seen": 74829800, + "step": 3469, + "time_per_iteration": 2.580603837966919 + }, + { + "auxiliary_loss_clip": 0.01180828, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.05264091, + "balance_loss_mlp": 1.02044451, + "epoch": 0.41724283051764566, + "flos": 35914763877120.0, + "grad_norm": 2.1710000814226094, + "language_loss": 0.61695391, + "learning_rate": 2.6227734785243924e-06, + "loss": 0.63905704, + "num_input_tokens_seen": 74849760, + "step": 3470, + "time_per_iteration": 2.570401906967163 + }, + { + "auxiliary_loss_clip": 0.01103893, + "auxiliary_loss_mlp": 0.01025199, + "balance_loss_clip": 1.04281986, + "balance_loss_mlp": 1.0173018, + "epoch": 0.4173630734082847, + "flos": 25333649320320.0, + "grad_norm": 11.067771064294295, + "language_loss": 0.79270077, + "learning_rate": 2.6220331875795466e-06, + "loss": 0.81399167, + "num_input_tokens_seen": 74869110, + "step": 3471, + "time_per_iteration": 2.623163938522339 + }, + { + "auxiliary_loss_clip": 0.01165239, + "auxiliary_loss_mlp": 0.01029891, + "balance_loss_clip": 1.05230427, + "balance_loss_mlp": 1.02164721, + "epoch": 0.4174833162989238, + "flos": 26685398868480.0, + "grad_norm": 1.653054843406505, + "language_loss": 0.7503953, + "learning_rate": 2.62129280227276e-06, + "loss": 0.77234662, + "num_input_tokens_seen": 74889110, + "step": 3472, + "time_per_iteration": 2.5103964805603027 + }, + { + "auxiliary_loss_clip": 0.01172667, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.05238307, + "balance_loss_mlp": 1.0259347, + "epoch": 0.41760355918956293, + "flos": 74739584010240.0, + "grad_norm": 1.9367760155530178, + "language_loss": 0.68340445, + "learning_rate": 2.62055232271635e-06, + "loss": 0.705477, + "num_input_tokens_seen": 74916260, + "step": 3473, + "time_per_iteration": 2.8592793941497803 + }, + { + "auxiliary_loss_clip": 0.01130314, + "auxiliary_loss_mlp": 0.01027395, + "balance_loss_clip": 1.04696918, + "balance_loss_mlp": 1.01933026, + "epoch": 0.417723802080202, + "flos": 14317513148160.0, + "grad_norm": 1.945403972843583, + "language_loss": 0.87755072, + "learning_rate": 2.619811749022646e-06, + "loss": 0.89912778, + "num_input_tokens_seen": 74931570, + "step": 3474, + "time_per_iteration": 2.4995055198669434 + }, + { + "auxiliary_loss_clip": 0.01170781, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.05418348, + "balance_loss_mlp": 1.02320981, + "epoch": 0.4178440449708411, + "flos": 14643797316480.0, + "grad_norm": 2.291969290459892, + "language_loss": 0.71711504, + "learning_rate": 2.6190710813039917e-06, + "loss": 0.73914027, + "num_input_tokens_seen": 74944695, + "step": 3475, + "time_per_iteration": 2.429954767227173 + }, + { + "auxiliary_loss_clip": 0.01119317, + "auxiliary_loss_mlp": 0.00763779, + "balance_loss_clip": 1.04352081, + "balance_loss_mlp": 1.00018811, + "epoch": 0.4179642878614802, + "flos": 21507296094720.0, + "grad_norm": 2.1346174732178995, + "language_loss": 0.83984888, + "learning_rate": 2.618330319672747e-06, + "loss": 0.85867989, + "num_input_tokens_seen": 74964115, + "step": 3476, + "time_per_iteration": 2.5777759552001953 + }, + { + "auxiliary_loss_clip": 0.01183848, + "auxiliary_loss_mlp": 0.01027566, + "balance_loss_clip": 1.0543288, + "balance_loss_mlp": 1.01938236, + "epoch": 0.41808453075211927, + "flos": 18441997257600.0, + "grad_norm": 2.360735482633372, + "language_loss": 0.91842341, + "learning_rate": 2.617589464241284e-06, + "loss": 0.94053757, + "num_input_tokens_seen": 74978515, + "step": 3477, + "time_per_iteration": 2.418271780014038 + }, + { + "auxiliary_loss_clip": 0.01142993, + "auxiliary_loss_mlp": 0.01023973, + "balance_loss_clip": 1.05011487, + "balance_loss_mlp": 1.01650465, + "epoch": 0.4182047736427584, + "flos": 20301020628480.0, + "grad_norm": 1.989406817569932, + "language_loss": 0.74753785, + "learning_rate": 2.6168485151219914e-06, + "loss": 0.76920748, + "num_input_tokens_seen": 74998135, + "step": 3478, + "time_per_iteration": 2.545653820037842 + }, + { + "auxiliary_loss_clip": 0.01169943, + "auxiliary_loss_mlp": 0.01026444, + "balance_loss_clip": 1.05275559, + "balance_loss_mlp": 1.01809978, + "epoch": 0.4183250165333975, + "flos": 18876623823360.0, + "grad_norm": 7.008066589815805, + "language_loss": 0.71727669, + "learning_rate": 2.616107472427269e-06, + "loss": 0.73924047, + "num_input_tokens_seen": 75012830, + "step": 3479, + "time_per_iteration": 2.4293010234832764 + }, + { + "auxiliary_loss_clip": 0.01175003, + "auxiliary_loss_mlp": 0.01025116, + "balance_loss_clip": 1.05257463, + "balance_loss_mlp": 1.01675367, + "epoch": 0.41844525942403654, + "flos": 17740050698880.0, + "grad_norm": 2.9363689596502995, + "language_loss": 0.76565349, + "learning_rate": 2.615366336269533e-06, + "loss": 0.78765464, + "num_input_tokens_seen": 75026495, + "step": 3480, + "time_per_iteration": 2.436577796936035 + }, + { + "auxiliary_loss_clip": 0.01185547, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.05353904, + "balance_loss_mlp": 1.02694905, + "epoch": 0.41856550231467565, + "flos": 18361377181440.0, + "grad_norm": 2.6185338129962386, + "language_loss": 0.80420965, + "learning_rate": 2.6146251067612126e-06, + "loss": 0.82642591, + "num_input_tokens_seen": 75041970, + "step": 3481, + "time_per_iteration": 2.412691354751587 + }, + { + "auxiliary_loss_clip": 0.01168067, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.05354917, + "balance_loss_mlp": 1.02039719, + "epoch": 0.41868574520531476, + "flos": 22781801445120.0, + "grad_norm": 1.5559728647902487, + "language_loss": 0.82525861, + "learning_rate": 2.6138837840147525e-06, + "loss": 0.84722614, + "num_input_tokens_seen": 75061005, + "step": 3482, + "time_per_iteration": 3.317591428756714 + }, + { + "auxiliary_loss_clip": 0.01138692, + "auxiliary_loss_mlp": 0.01028026, + "balance_loss_clip": 1.04896796, + "balance_loss_mlp": 1.01991975, + "epoch": 0.4188059880959538, + "flos": 13699167494400.0, + "grad_norm": 2.2546182438774176, + "language_loss": 0.76345217, + "learning_rate": 2.6131423681426103e-06, + "loss": 0.7851193, + "num_input_tokens_seen": 75076920, + "step": 3483, + "time_per_iteration": 2.500379800796509 + }, + { + "auxiliary_loss_clip": 0.01183503, + "auxiliary_loss_mlp": 0.01025574, + "balance_loss_clip": 1.0551976, + "balance_loss_mlp": 1.01827884, + "epoch": 0.41892623098659293, + "flos": 37818281220480.0, + "grad_norm": 1.6276821832869575, + "language_loss": 0.72819209, + "learning_rate": 2.6124008592572587e-06, + "loss": 0.75028282, + "num_input_tokens_seen": 75100905, + "step": 3484, + "time_per_iteration": 2.60031795501709 + }, + { + "auxiliary_loss_clip": 0.01186815, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.05375266, + "balance_loss_mlp": 1.01998127, + "epoch": 0.419046473877232, + "flos": 23258874908160.0, + "grad_norm": 3.085197331183568, + "language_loss": 0.82063305, + "learning_rate": 2.6116592574711835e-06, + "loss": 0.84278882, + "num_input_tokens_seen": 75119205, + "step": 3485, + "time_per_iteration": 2.4477596282958984 + }, + { + "auxiliary_loss_clip": 0.01187818, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.05585194, + "balance_loss_mlp": 1.02883172, + "epoch": 0.4191667167678711, + "flos": 20741034234240.0, + "grad_norm": 1.7808387905703396, + "language_loss": 0.84116077, + "learning_rate": 2.6109175628968853e-06, + "loss": 0.86341482, + "num_input_tokens_seen": 75138970, + "step": 3486, + "time_per_iteration": 3.312459707260132 + }, + { + "auxiliary_loss_clip": 0.01160745, + "auxiliary_loss_mlp": 0.01028275, + "balance_loss_clip": 1.05087364, + "balance_loss_mlp": 1.02054977, + "epoch": 0.4192869596585102, + "flos": 23586416052480.0, + "grad_norm": 1.8839716590873883, + "language_loss": 0.82592857, + "learning_rate": 2.610175775646878e-06, + "loss": 0.84781879, + "num_input_tokens_seen": 75157550, + "step": 3487, + "time_per_iteration": 2.485684394836426 + }, + { + "auxiliary_loss_clip": 0.01152335, + "auxiliary_loss_mlp": 0.01029144, + "balance_loss_clip": 1.04782867, + "balance_loss_mlp": 1.02097869, + "epoch": 0.41940720254914926, + "flos": 25081269384960.0, + "grad_norm": 1.9322295648543002, + "language_loss": 0.7250151, + "learning_rate": 2.6094338958336907e-06, + "loss": 0.74682987, + "num_input_tokens_seen": 75176220, + "step": 3488, + "time_per_iteration": 3.348875045776367 + }, + { + "auxiliary_loss_clip": 0.01155766, + "auxiliary_loss_mlp": 0.01027057, + "balance_loss_clip": 1.05182672, + "balance_loss_mlp": 1.01898026, + "epoch": 0.41952744543978837, + "flos": 15554132628480.0, + "grad_norm": 2.0032949221501264, + "language_loss": 0.82409155, + "learning_rate": 2.608691923569867e-06, + "loss": 0.84591979, + "num_input_tokens_seen": 75193095, + "step": 3489, + "time_per_iteration": 2.5133297443389893 + }, + { + "auxiliary_loss_clip": 0.01171894, + "auxiliary_loss_mlp": 0.01029811, + "balance_loss_clip": 1.05367529, + "balance_loss_mlp": 1.02166343, + "epoch": 0.4196476883304275, + "flos": 24644775312000.0, + "grad_norm": 1.6185373153168154, + "language_loss": 0.75680375, + "learning_rate": 2.6079498589679616e-06, + "loss": 0.77882075, + "num_input_tokens_seen": 75214185, + "step": 3490, + "time_per_iteration": 3.2565455436706543 + }, + { + "auxiliary_loss_clip": 0.01105126, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.0413866, + "balance_loss_mlp": 1.02512932, + "epoch": 0.41976793122106654, + "flos": 24531333183360.0, + "grad_norm": 2.767977131496339, + "language_loss": 0.75896966, + "learning_rate": 2.6072077021405465e-06, + "loss": 0.7803669, + "num_input_tokens_seen": 75233020, + "step": 3491, + "time_per_iteration": 2.621495485305786 + }, + { + "auxiliary_loss_clip": 0.01148216, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.04925704, + "balance_loss_mlp": 1.0259335, + "epoch": 0.41988817411170565, + "flos": 21175301664000.0, + "grad_norm": 1.697596358884439, + "language_loss": 0.68812382, + "learning_rate": 2.6064654532002054e-06, + "loss": 0.70994234, + "num_input_tokens_seen": 75252030, + "step": 3492, + "time_per_iteration": 2.519561767578125 + }, + { + "auxiliary_loss_clip": 0.01184474, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.05488694, + "balance_loss_mlp": 1.02591014, + "epoch": 0.42000841700234476, + "flos": 31649402626560.0, + "grad_norm": 3.9879169386297106, + "language_loss": 0.75748897, + "learning_rate": 2.6057231122595375e-06, + "loss": 0.7796734, + "num_input_tokens_seen": 75273340, + "step": 3493, + "time_per_iteration": 2.5316569805145264 + }, + { + "auxiliary_loss_clip": 0.01155415, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.04857957, + "balance_loss_mlp": 1.022614, + "epoch": 0.4201286598929838, + "flos": 21281525159040.0, + "grad_norm": 1.6304732473793624, + "language_loss": 0.72744763, + "learning_rate": 2.604980679431154e-06, + "loss": 0.74931419, + "num_input_tokens_seen": 75291580, + "step": 3494, + "time_per_iteration": 2.5114362239837646 + }, + { + "auxiliary_loss_clip": 0.01170537, + "auxiliary_loss_mlp": 0.01027123, + "balance_loss_clip": 1.04991305, + "balance_loss_mlp": 1.01922584, + "epoch": 0.4202489027836229, + "flos": 18546532813440.0, + "grad_norm": 1.990481141980804, + "language_loss": 0.7480433, + "learning_rate": 2.604238154827684e-06, + "loss": 0.77002001, + "num_input_tokens_seen": 75308205, + "step": 3495, + "time_per_iteration": 2.4571053981781006 + }, + { + "auxiliary_loss_clip": 0.01169991, + "auxiliary_loss_mlp": 0.01024945, + "balance_loss_clip": 1.05244744, + "balance_loss_mlp": 1.01691628, + "epoch": 0.42036914567426203, + "flos": 19317643009920.0, + "grad_norm": 1.8816238367556346, + "language_loss": 0.72657776, + "learning_rate": 2.6034955385617656e-06, + "loss": 0.74852717, + "num_input_tokens_seen": 75326535, + "step": 3496, + "time_per_iteration": 2.450303554534912 + }, + { + "auxiliary_loss_clip": 0.01051489, + "auxiliary_loss_mlp": 0.0100441, + "balance_loss_clip": 1.01800179, + "balance_loss_mlp": 1.00332499, + "epoch": 0.4204893885649011, + "flos": 67842942935040.0, + "grad_norm": 0.8101331624344884, + "language_loss": 0.61675388, + "learning_rate": 2.6027528307460544e-06, + "loss": 0.63731289, + "num_input_tokens_seen": 75390540, + "step": 3497, + "time_per_iteration": 3.181201219558716 + }, + { + "auxiliary_loss_clip": 0.01183969, + "auxiliary_loss_mlp": 0.01025552, + "balance_loss_clip": 1.05389047, + "balance_loss_mlp": 1.01807761, + "epoch": 0.4206096314555402, + "flos": 21908777385600.0, + "grad_norm": 2.066648623615736, + "language_loss": 0.86420363, + "learning_rate": 2.602010031493217e-06, + "loss": 0.88629889, + "num_input_tokens_seen": 75408770, + "step": 3498, + "time_per_iteration": 2.477231740951538 + }, + { + "auxiliary_loss_clip": 0.01136994, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.04853821, + "balance_loss_mlp": 1.02462125, + "epoch": 0.42072987434617926, + "flos": 29278185269760.0, + "grad_norm": 1.9352074136477548, + "language_loss": 0.86761618, + "learning_rate": 2.6012671409159367e-06, + "loss": 0.88931578, + "num_input_tokens_seen": 75430105, + "step": 3499, + "time_per_iteration": 2.5866153240203857 + }, + { + "auxiliary_loss_clip": 0.01150318, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.05144715, + "balance_loss_mlp": 1.02129054, + "epoch": 0.42085011723681837, + "flos": 27600726170880.0, + "grad_norm": 1.8928820323263145, + "language_loss": 0.81662506, + "learning_rate": 2.6005241591269097e-06, + "loss": 0.83842683, + "num_input_tokens_seen": 75449475, + "step": 3500, + "time_per_iteration": 2.536463975906372 + }, + { + "auxiliary_loss_clip": 0.01135685, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.05114555, + "balance_loss_mlp": 1.02265811, + "epoch": 0.4209703601274575, + "flos": 27818632028160.0, + "grad_norm": 1.7220329486448918, + "language_loss": 0.79491782, + "learning_rate": 2.5997810862388454e-06, + "loss": 0.81658089, + "num_input_tokens_seen": 75469315, + "step": 3501, + "time_per_iteration": 2.63895845413208 + }, + { + "auxiliary_loss_clip": 0.01155156, + "auxiliary_loss_mlp": 0.01028641, + "balance_loss_clip": 1.04859161, + "balance_loss_mlp": 1.02044547, + "epoch": 0.42109060301809653, + "flos": 27525529048320.0, + "grad_norm": 1.9726864276113674, + "language_loss": 0.75441056, + "learning_rate": 2.599037922364467e-06, + "loss": 0.77624857, + "num_input_tokens_seen": 75488215, + "step": 3502, + "time_per_iteration": 2.5483884811401367 + }, + { + "auxiliary_loss_clip": 0.01135715, + "auxiliary_loss_mlp": 0.01025802, + "balance_loss_clip": 1.04949701, + "balance_loss_mlp": 1.01752949, + "epoch": 0.42121084590873564, + "flos": 29314275459840.0, + "grad_norm": 2.1339329649153487, + "language_loss": 0.75387096, + "learning_rate": 2.5982946676165112e-06, + "loss": 0.77548611, + "num_input_tokens_seen": 75507985, + "step": 3503, + "time_per_iteration": 2.5771071910858154 + }, + { + "auxiliary_loss_clip": 0.01057201, + "auxiliary_loss_mlp": 0.01003984, + "balance_loss_clip": 1.03015924, + "balance_loss_mlp": 1.00238693, + "epoch": 0.42133108879937475, + "flos": 67398835178880.0, + "grad_norm": 0.7289518483371384, + "language_loss": 0.57607305, + "learning_rate": 2.5975513221077313e-06, + "loss": 0.59668487, + "num_input_tokens_seen": 75571955, + "step": 3504, + "time_per_iteration": 3.1944680213928223 + }, + { + "auxiliary_loss_clip": 0.01147057, + "auxiliary_loss_mlp": 0.01032023, + "balance_loss_clip": 1.05017352, + "balance_loss_mlp": 1.02370226, + "epoch": 0.4214513316900138, + "flos": 23106038538240.0, + "grad_norm": 2.582364925543384, + "language_loss": 0.88772297, + "learning_rate": 2.5968078859508897e-06, + "loss": 0.90951371, + "num_input_tokens_seen": 75589155, + "step": 3505, + "time_per_iteration": 2.4954302310943604 + }, + { + "auxiliary_loss_clip": 0.0116895, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.05304158, + "balance_loss_mlp": 1.02011251, + "epoch": 0.4215715745806529, + "flos": 15336190857600.0, + "grad_norm": 2.0607230537098666, + "language_loss": 0.80054593, + "learning_rate": 2.5960643592587673e-06, + "loss": 0.82251602, + "num_input_tokens_seen": 75606565, + "step": 3506, + "time_per_iteration": 2.4433610439300537 + }, + { + "auxiliary_loss_clip": 0.01141058, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.04900134, + "balance_loss_mlp": 1.01985788, + "epoch": 0.42169181747129203, + "flos": 22127257860480.0, + "grad_norm": 2.0123536169178435, + "language_loss": 0.81577432, + "learning_rate": 2.5953207421441553e-06, + "loss": 0.83745903, + "num_input_tokens_seen": 75625165, + "step": 3507, + "time_per_iteration": 2.540236473083496 + }, + { + "auxiliary_loss_clip": 0.01145165, + "auxiliary_loss_mlp": 0.01031624, + "balance_loss_clip": 1.05258, + "balance_loss_mlp": 1.0239414, + "epoch": 0.4218120603619311, + "flos": 22630724841600.0, + "grad_norm": 2.151817061630336, + "language_loss": 0.75429857, + "learning_rate": 2.5945770347198603e-06, + "loss": 0.77606642, + "num_input_tokens_seen": 75643320, + "step": 3508, + "time_per_iteration": 3.3331243991851807 + }, + { + "auxiliary_loss_clip": 0.01150369, + "auxiliary_loss_mlp": 0.01022054, + "balance_loss_clip": 1.04902506, + "balance_loss_mlp": 1.01468706, + "epoch": 0.4219323032525702, + "flos": 19682818629120.0, + "grad_norm": 1.6912967643455545, + "language_loss": 0.81626594, + "learning_rate": 2.593833237098701e-06, + "loss": 0.83799016, + "num_input_tokens_seen": 75660920, + "step": 3509, + "time_per_iteration": 2.510230779647827 + }, + { + "auxiliary_loss_clip": 0.01166519, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.04867649, + "balance_loss_mlp": 1.02115536, + "epoch": 0.4220525461432093, + "flos": 30190747224960.0, + "grad_norm": 1.8069589798945016, + "language_loss": 0.6215992, + "learning_rate": 2.593089349393512e-06, + "loss": 0.64356172, + "num_input_tokens_seen": 75681410, + "step": 3510, + "time_per_iteration": 2.546098470687866 + }, + { + "auxiliary_loss_clip": 0.01170095, + "auxiliary_loss_mlp": 0.01025437, + "balance_loss_clip": 1.05638552, + "balance_loss_mlp": 1.01773643, + "epoch": 0.42217278903384836, + "flos": 24315941278080.0, + "grad_norm": 2.119552249181925, + "language_loss": 0.83804792, + "learning_rate": 2.592345371717141e-06, + "loss": 0.86000323, + "num_input_tokens_seen": 75700940, + "step": 3511, + "time_per_iteration": 2.493624448776245 + }, + { + "auxiliary_loss_clip": 0.01172858, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.05883813, + "balance_loss_mlp": 1.02148247, + "epoch": 0.42229303192448747, + "flos": 17092474352640.0, + "grad_norm": 2.330988490580442, + "language_loss": 0.71804029, + "learning_rate": 2.591601304182448e-06, + "loss": 0.74006718, + "num_input_tokens_seen": 75718910, + "step": 3512, + "time_per_iteration": 3.300139904022217 + }, + { + "auxiliary_loss_clip": 0.01156208, + "auxiliary_loss_mlp": 0.01027477, + "balance_loss_clip": 1.05470657, + "balance_loss_mlp": 1.02029443, + "epoch": 0.4224132748151266, + "flos": 22784530878720.0, + "grad_norm": 1.7998042832617744, + "language_loss": 0.79349375, + "learning_rate": 2.5908571469023067e-06, + "loss": 0.81533056, + "num_input_tokens_seen": 75738395, + "step": 3513, + "time_per_iteration": 2.5223584175109863 + }, + { + "auxiliary_loss_clip": 0.01184027, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.05492163, + "balance_loss_mlp": 1.02240252, + "epoch": 0.42253351770576564, + "flos": 17819090576640.0, + "grad_norm": 2.464266565644855, + "language_loss": 0.75665057, + "learning_rate": 2.5901128999896067e-06, + "loss": 0.77879429, + "num_input_tokens_seen": 75753825, + "step": 3514, + "time_per_iteration": 2.401898145675659 + }, + { + "auxiliary_loss_clip": 0.01170024, + "auxiliary_loss_mlp": 0.01027642, + "balance_loss_clip": 1.05601716, + "balance_loss_mlp": 1.01997733, + "epoch": 0.42265376059640475, + "flos": 28512390286080.0, + "grad_norm": 1.591482796493947, + "language_loss": 0.67968857, + "learning_rate": 2.5893685635572487e-06, + "loss": 0.70166528, + "num_input_tokens_seen": 75774675, + "step": 3515, + "time_per_iteration": 3.2679989337921143 + }, + { + "auxiliary_loss_clip": 0.01157476, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.05532002, + "balance_loss_mlp": 1.02132416, + "epoch": 0.4227740034870438, + "flos": 16253349753600.0, + "grad_norm": 1.8927749769020958, + "language_loss": 0.68916708, + "learning_rate": 2.5886241377181483e-06, + "loss": 0.71103811, + "num_input_tokens_seen": 75793545, + "step": 3516, + "time_per_iteration": 2.508251428604126 + }, + { + "auxiliary_loss_clip": 0.01174479, + "auxiliary_loss_mlp": 0.01029051, + "balance_loss_clip": 1.05570459, + "balance_loss_mlp": 1.0204556, + "epoch": 0.4228942463776829, + "flos": 25295691623040.0, + "grad_norm": 1.7004153494485628, + "language_loss": 0.81485415, + "learning_rate": 2.587879622585234e-06, + "loss": 0.83688939, + "num_input_tokens_seen": 75812145, + "step": 3517, + "time_per_iteration": 3.2553234100341797 + }, + { + "auxiliary_loss_clip": 0.01169647, + "auxiliary_loss_mlp": 0.01031274, + "balance_loss_clip": 1.05565786, + "balance_loss_mlp": 1.02340662, + "epoch": 0.423014489268322, + "flos": 26395779507840.0, + "grad_norm": 2.354801716907938, + "language_loss": 0.75580376, + "learning_rate": 2.5871350182714486e-06, + "loss": 0.77781296, + "num_input_tokens_seen": 75833025, + "step": 3518, + "time_per_iteration": 2.603123188018799 + }, + { + "auxiliary_loss_clip": 0.01183822, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.0559907, + "balance_loss_mlp": 1.01895237, + "epoch": 0.4231347321589611, + "flos": 17274002711040.0, + "grad_norm": 2.167273863329668, + "language_loss": 0.80323267, + "learning_rate": 2.586390324889748e-06, + "loss": 0.82533491, + "num_input_tokens_seen": 75848925, + "step": 3519, + "time_per_iteration": 2.456437110900879 + }, + { + "auxiliary_loss_clip": 0.01171034, + "auxiliary_loss_mlp": 0.01031623, + "balance_loss_clip": 1.05701935, + "balance_loss_mlp": 1.02421975, + "epoch": 0.4232549750496002, + "flos": 22999635475200.0, + "grad_norm": 1.8381712125618284, + "language_loss": 0.67579174, + "learning_rate": 2.5856455425531003e-06, + "loss": 0.69781828, + "num_input_tokens_seen": 75870400, + "step": 3520, + "time_per_iteration": 2.4923095703125 + }, + { + "auxiliary_loss_clip": 0.01171643, + "auxiliary_loss_mlp": 0.0102523, + "balance_loss_clip": 1.05672765, + "balance_loss_mlp": 1.01748681, + "epoch": 0.4233752179402393, + "flos": 21248343970560.0, + "grad_norm": 1.7376983949816318, + "language_loss": 0.80391335, + "learning_rate": 2.5849006713744902e-06, + "loss": 0.82588208, + "num_input_tokens_seen": 75889195, + "step": 3521, + "time_per_iteration": 2.4542765617370605 + }, + { + "auxiliary_loss_clip": 0.01154491, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.0523802, + "balance_loss_mlp": 1.02002847, + "epoch": 0.42349546083087836, + "flos": 20704297599360.0, + "grad_norm": 2.044537984655198, + "language_loss": 0.72736812, + "learning_rate": 2.5841557114669135e-06, + "loss": 0.7491951, + "num_input_tokens_seen": 75906055, + "step": 3522, + "time_per_iteration": 2.4920802116394043 + }, + { + "auxiliary_loss_clip": 0.01189986, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.05591679, + "balance_loss_mlp": 1.02198803, + "epoch": 0.42361570372151747, + "flos": 18585065128320.0, + "grad_norm": 9.759805701092263, + "language_loss": 0.67134333, + "learning_rate": 2.58341066294338e-06, + "loss": 0.69354922, + "num_input_tokens_seen": 75922720, + "step": 3523, + "time_per_iteration": 2.4124362468719482 + }, + { + "auxiliary_loss_clip": 0.0113476, + "auxiliary_loss_mlp": 0.00762849, + "balance_loss_clip": 1.05186999, + "balance_loss_mlp": 1.00041032, + "epoch": 0.4237359466121566, + "flos": 20959478795520.0, + "grad_norm": 2.3431021267201904, + "language_loss": 0.85559934, + "learning_rate": 2.5826655259169124e-06, + "loss": 0.87457538, + "num_input_tokens_seen": 75941375, + "step": 3524, + "time_per_iteration": 2.602137327194214 + }, + { + "auxiliary_loss_clip": 0.01188391, + "auxiliary_loss_mlp": 0.01031083, + "balance_loss_clip": 1.05849552, + "balance_loss_mlp": 1.02301896, + "epoch": 0.42385618950279563, + "flos": 18038181582720.0, + "grad_norm": 2.1170359855964445, + "language_loss": 0.90596938, + "learning_rate": 2.5819203005005475e-06, + "loss": 0.92816412, + "num_input_tokens_seen": 75958710, + "step": 3525, + "time_per_iteration": 2.5590906143188477 + }, + { + "auxiliary_loss_clip": 0.01152699, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.05451465, + "balance_loss_mlp": 1.02406764, + "epoch": 0.42397643239343474, + "flos": 23769129559680.0, + "grad_norm": 2.016743251961724, + "language_loss": 0.78911197, + "learning_rate": 2.581174986807336e-06, + "loss": 0.81095731, + "num_input_tokens_seen": 75978945, + "step": 3526, + "time_per_iteration": 2.5470805168151855 + }, + { + "auxiliary_loss_clip": 0.01162197, + "auxiliary_loss_mlp": 0.00762812, + "balance_loss_clip": 1.05285668, + "balance_loss_mlp": 1.00040948, + "epoch": 0.42409667528407385, + "flos": 16545088016640.0, + "grad_norm": 2.2881186211263764, + "language_loss": 0.9098134, + "learning_rate": 2.580429584950341e-06, + "loss": 0.92906344, + "num_input_tokens_seen": 75994695, + "step": 3527, + "time_per_iteration": 2.4381277561187744 + }, + { + "auxiliary_loss_clip": 0.0114904, + "auxiliary_loss_mlp": 0.01026268, + "balance_loss_clip": 1.05291605, + "balance_loss_mlp": 1.01747012, + "epoch": 0.4242169181747129, + "flos": 16034186920320.0, + "grad_norm": 2.0953409189123198, + "language_loss": 0.66436046, + "learning_rate": 2.5796840950426397e-06, + "loss": 0.68611354, + "num_input_tokens_seen": 76011780, + "step": 3528, + "time_per_iteration": 2.5010526180267334 + }, + { + "auxiliary_loss_clip": 0.0116075, + "auxiliary_loss_mlp": 0.01029284, + "balance_loss_clip": 1.05131042, + "balance_loss_mlp": 1.02163661, + "epoch": 0.424337161065352, + "flos": 20084012611200.0, + "grad_norm": 1.8718419769513992, + "language_loss": 0.65617955, + "learning_rate": 2.578938517197322e-06, + "loss": 0.67807996, + "num_input_tokens_seen": 76029875, + "step": 3529, + "time_per_iteration": 2.463440179824829 + }, + { + "auxiliary_loss_clip": 0.01146128, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.05065584, + "balance_loss_mlp": 1.01976764, + "epoch": 0.4244574039559911, + "flos": 23878369797120.0, + "grad_norm": 2.1711844819422317, + "language_loss": 0.62226725, + "learning_rate": 2.5781928515274916e-06, + "loss": 0.6440056, + "num_input_tokens_seen": 76048595, + "step": 3530, + "time_per_iteration": 2.522073268890381 + }, + { + "auxiliary_loss_clip": 0.01175823, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_clip": 1.0578289, + "balance_loss_mlp": 1.02146935, + "epoch": 0.4245776468466302, + "flos": 17565920542080.0, + "grad_norm": 1.91141545402392, + "language_loss": 0.67646921, + "learning_rate": 2.577447098146265e-06, + "loss": 0.69851661, + "num_input_tokens_seen": 76065770, + "step": 3531, + "time_per_iteration": 2.4430432319641113 + }, + { + "auxiliary_loss_clip": 0.01142812, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.05106425, + "balance_loss_mlp": 1.02803016, + "epoch": 0.4246978897372693, + "flos": 27776256958080.0, + "grad_norm": 1.749077554336442, + "language_loss": 0.78845739, + "learning_rate": 2.5767012571667724e-06, + "loss": 0.81024373, + "num_input_tokens_seen": 76085250, + "step": 3532, + "time_per_iteration": 2.579741954803467 + }, + { + "auxiliary_loss_clip": 0.0117314, + "auxiliary_loss_mlp": 0.01025766, + "balance_loss_clip": 1.05347848, + "balance_loss_mlp": 1.01669431, + "epoch": 0.42481813262790835, + "flos": 15596615439360.0, + "grad_norm": 2.0263533953766473, + "language_loss": 0.67991364, + "learning_rate": 2.5759553287021587e-06, + "loss": 0.70190275, + "num_input_tokens_seen": 76103580, + "step": 3533, + "time_per_iteration": 2.4520928859710693 + }, + { + "auxiliary_loss_clip": 0.01155958, + "auxiliary_loss_mlp": 0.01027463, + "balance_loss_clip": 1.05504012, + "balance_loss_mlp": 1.01883841, + "epoch": 0.42493837551854746, + "flos": 23951088881280.0, + "grad_norm": 2.0860933239739214, + "language_loss": 0.77319604, + "learning_rate": 2.5752093128655786e-06, + "loss": 0.79503024, + "num_input_tokens_seen": 76121825, + "step": 3534, + "time_per_iteration": 2.5165352821350098 + }, + { + "auxiliary_loss_clip": 0.01148916, + "auxiliary_loss_mlp": 0.01025351, + "balance_loss_clip": 1.05022907, + "balance_loss_mlp": 1.01698267, + "epoch": 0.4250586184091866, + "flos": 20813466009600.0, + "grad_norm": 2.29887617168106, + "language_loss": 0.73720282, + "learning_rate": 2.574463209770204e-06, + "loss": 0.75894547, + "num_input_tokens_seen": 76141140, + "step": 3535, + "time_per_iteration": 3.3060848712921143 + }, + { + "auxiliary_loss_clip": 0.0114067, + "auxiliary_loss_mlp": 0.01031257, + "balance_loss_clip": 1.05043602, + "balance_loss_mlp": 1.02297163, + "epoch": 0.42517886129982563, + "flos": 30371018607360.0, + "grad_norm": 1.719612265982136, + "language_loss": 0.79694527, + "learning_rate": 2.5737170195292165e-06, + "loss": 0.81866461, + "num_input_tokens_seen": 76164475, + "step": 3536, + "time_per_iteration": 2.6351466178894043 + }, + { + "auxiliary_loss_clip": 0.01141957, + "auxiliary_loss_mlp": 0.01028683, + "balance_loss_clip": 1.04971194, + "balance_loss_mlp": 1.01990306, + "epoch": 0.42529910419046474, + "flos": 20080636732800.0, + "grad_norm": 2.000824768490275, + "language_loss": 0.78247178, + "learning_rate": 2.572970742255814e-06, + "loss": 0.80417824, + "num_input_tokens_seen": 76182965, + "step": 3537, + "time_per_iteration": 2.5373334884643555 + }, + { + "auxiliary_loss_clip": 0.01171066, + "auxiliary_loss_mlp": 0.01028203, + "balance_loss_clip": 1.05716693, + "balance_loss_mlp": 1.02056146, + "epoch": 0.42541934708110385, + "flos": 22632448694400.0, + "grad_norm": 1.723061995218477, + "language_loss": 0.81909847, + "learning_rate": 2.5722243780632046e-06, + "loss": 0.84109116, + "num_input_tokens_seen": 76201230, + "step": 3538, + "time_per_iteration": 2.487924098968506 + }, + { + "auxiliary_loss_clip": 0.01043385, + "auxiliary_loss_mlp": 0.01004675, + "balance_loss_clip": 1.02050674, + "balance_loss_mlp": 1.00367403, + "epoch": 0.4255395899717429, + "flos": 66200676186240.0, + "grad_norm": 0.7732521984192919, + "language_loss": 0.6047954, + "learning_rate": 2.5714779270646125e-06, + "loss": 0.62527603, + "num_input_tokens_seen": 76262000, + "step": 3539, + "time_per_iteration": 3.9092202186584473 + }, + { + "auxiliary_loss_clip": 0.01162388, + "auxiliary_loss_mlp": 0.0076273, + "balance_loss_clip": 1.05590367, + "balance_loss_mlp": 1.00038922, + "epoch": 0.425659832862382, + "flos": 17931814433280.0, + "grad_norm": 2.6841389965686613, + "language_loss": 0.77799147, + "learning_rate": 2.5707313893732735e-06, + "loss": 0.79724264, + "num_input_tokens_seen": 76280540, + "step": 3540, + "time_per_iteration": 2.483970880508423 + }, + { + "auxiliary_loss_clip": 0.01091331, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.04120362, + "balance_loss_mlp": 1.02106535, + "epoch": 0.4257800757530211, + "flos": 24022550989440.0, + "grad_norm": 1.6796403842919871, + "language_loss": 0.77069861, + "learning_rate": 2.5699847651024364e-06, + "loss": 0.79190314, + "num_input_tokens_seen": 76301180, + "step": 3541, + "time_per_iteration": 2.65840482711792 + }, + { + "auxiliary_loss_clip": 0.01169351, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.0560087, + "balance_loss_mlp": 1.02093124, + "epoch": 0.4259003186436602, + "flos": 23696015425920.0, + "grad_norm": 3.6313339743253934, + "language_loss": 0.76857674, + "learning_rate": 2.5692380543653627e-06, + "loss": 0.7905584, + "num_input_tokens_seen": 76319335, + "step": 3542, + "time_per_iteration": 3.2987897396087646 + }, + { + "auxiliary_loss_clip": 0.01175421, + "auxiliary_loss_mlp": 0.00763123, + "balance_loss_clip": 1.05565202, + "balance_loss_mlp": 1.00047863, + "epoch": 0.4260205615342993, + "flos": 15259772672640.0, + "grad_norm": 2.0202996225289915, + "language_loss": 0.69757605, + "learning_rate": 2.5684912572753293e-06, + "loss": 0.7169615, + "num_input_tokens_seen": 76335010, + "step": 3543, + "time_per_iteration": 2.4561078548431396 + }, + { + "auxiliary_loss_clip": 0.01181584, + "auxiliary_loss_mlp": 0.01022966, + "balance_loss_clip": 1.0555141, + "balance_loss_mlp": 1.01529479, + "epoch": 0.4261408044249384, + "flos": 30665306736000.0, + "grad_norm": 1.7189432327803047, + "language_loss": 0.84336442, + "learning_rate": 2.5677443739456245e-06, + "loss": 0.86540991, + "num_input_tokens_seen": 76356670, + "step": 3544, + "time_per_iteration": 3.206366777420044 + }, + { + "auxiliary_loss_clip": 0.01158528, + "auxiliary_loss_mlp": 0.01023892, + "balance_loss_clip": 1.05613399, + "balance_loss_mlp": 1.01613712, + "epoch": 0.42626104731557746, + "flos": 23257905240960.0, + "grad_norm": 2.3803050649695447, + "language_loss": 0.79395318, + "learning_rate": 2.5669974044895495e-06, + "loss": 0.8157773, + "num_input_tokens_seen": 76373065, + "step": 3545, + "time_per_iteration": 2.509317636489868 + }, + { + "auxiliary_loss_clip": 0.01149294, + "auxiliary_loss_mlp": 0.0102665, + "balance_loss_clip": 1.051085, + "balance_loss_mlp": 1.01846027, + "epoch": 0.42638129020621657, + "flos": 25884770670720.0, + "grad_norm": 1.8138901651688484, + "language_loss": 0.79966331, + "learning_rate": 2.5662503490204187e-06, + "loss": 0.82142276, + "num_input_tokens_seen": 76393230, + "step": 3546, + "time_per_iteration": 2.57382869720459 + }, + { + "auxiliary_loss_clip": 0.0115434, + "auxiliary_loss_mlp": 0.01022785, + "balance_loss_clip": 1.0512619, + "balance_loss_mlp": 1.01500082, + "epoch": 0.4265015330968556, + "flos": 26502362138880.0, + "grad_norm": 2.0335016009604905, + "language_loss": 0.76303583, + "learning_rate": 2.5655032076515603e-06, + "loss": 0.78480709, + "num_input_tokens_seen": 76412555, + "step": 3547, + "time_per_iteration": 2.564664840698242 + }, + { + "auxiliary_loss_clip": 0.01158405, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.05305004, + "balance_loss_mlp": 1.01913369, + "epoch": 0.42662177598749473, + "flos": 24389522288640.0, + "grad_norm": 2.8284615523244767, + "language_loss": 0.82254064, + "learning_rate": 2.5647559804963155e-06, + "loss": 0.84439582, + "num_input_tokens_seen": 76432485, + "step": 3548, + "time_per_iteration": 2.5287795066833496 + }, + { + "auxiliary_loss_clip": 0.01138534, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.05347896, + "balance_loss_mlp": 1.02463269, + "epoch": 0.42674201887813384, + "flos": 23148629089920.0, + "grad_norm": 1.981451402656454, + "language_loss": 0.78943217, + "learning_rate": 2.5640086676680364e-06, + "loss": 0.81114197, + "num_input_tokens_seen": 76453980, + "step": 3549, + "time_per_iteration": 2.6089608669281006 + }, + { + "auxiliary_loss_clip": 0.01173815, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.05610466, + "balance_loss_mlp": 1.0191896, + "epoch": 0.4268622617687729, + "flos": 21689614552320.0, + "grad_norm": 2.294518990482537, + "language_loss": 0.80787909, + "learning_rate": 2.5632612692800923e-06, + "loss": 0.82989573, + "num_input_tokens_seen": 76473045, + "step": 3550, + "time_per_iteration": 2.4722511768341064 + }, + { + "auxiliary_loss_clip": 0.01143307, + "auxiliary_loss_mlp": 0.01036092, + "balance_loss_clip": 1.05048192, + "balance_loss_mlp": 1.02725244, + "epoch": 0.426982504659412, + "flos": 23440151871360.0, + "grad_norm": 1.9535029950736376, + "language_loss": 0.75238627, + "learning_rate": 2.5625137854458603e-06, + "loss": 0.77418023, + "num_input_tokens_seen": 76492060, + "step": 3551, + "time_per_iteration": 2.5510425567626953 + }, + { + "auxiliary_loss_clip": 0.01160055, + "auxiliary_loss_mlp": 0.0102903, + "balance_loss_clip": 1.05492651, + "balance_loss_mlp": 1.02156794, + "epoch": 0.4271027475500511, + "flos": 18916556768640.0, + "grad_norm": 1.9293302171028055, + "language_loss": 0.80180645, + "learning_rate": 2.561766216278735e-06, + "loss": 0.82369727, + "num_input_tokens_seen": 76509655, + "step": 3552, + "time_per_iteration": 2.4863080978393555 + }, + { + "auxiliary_loss_clip": 0.01127058, + "auxiliary_loss_mlp": 0.01028716, + "balance_loss_clip": 1.05039573, + "balance_loss_mlp": 1.02047825, + "epoch": 0.4272229904406902, + "flos": 26870554500480.0, + "grad_norm": 1.7434203483395674, + "language_loss": 0.80870008, + "learning_rate": 2.561018561892121e-06, + "loss": 0.83025783, + "num_input_tokens_seen": 76528795, + "step": 3553, + "time_per_iteration": 2.5821099281311035 + }, + { + "auxiliary_loss_clip": 0.01153879, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.05030298, + "balance_loss_mlp": 1.02555037, + "epoch": 0.4273432333313293, + "flos": 23951376190080.0, + "grad_norm": 1.5333451459252483, + "language_loss": 0.76568747, + "learning_rate": 2.5602708223994363e-06, + "loss": 0.78755951, + "num_input_tokens_seen": 76550660, + "step": 3554, + "time_per_iteration": 2.5269036293029785 + }, + { + "auxiliary_loss_clip": 0.01143208, + "auxiliary_loss_mlp": 0.01025685, + "balance_loss_clip": 1.04758322, + "balance_loss_mlp": 1.01794851, + "epoch": 0.4274634762219684, + "flos": 29570354496000.0, + "grad_norm": 2.1282753049550367, + "language_loss": 0.67153227, + "learning_rate": 2.559522997914115e-06, + "loss": 0.69322121, + "num_input_tokens_seen": 76570240, + "step": 3555, + "time_per_iteration": 2.5892279148101807 + }, + { + "auxiliary_loss_clip": 0.01184469, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.05815864, + "balance_loss_mlp": 1.02251947, + "epoch": 0.42758371911260745, + "flos": 21434146047360.0, + "grad_norm": 1.9272640727724966, + "language_loss": 0.84807754, + "learning_rate": 2.558775088549599e-06, + "loss": 0.87022161, + "num_input_tokens_seen": 76589820, + "step": 3556, + "time_per_iteration": 2.4482078552246094 + }, + { + "auxiliary_loss_clip": 0.0117736, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.05546021, + "balance_loss_mlp": 1.02057838, + "epoch": 0.42770396200324656, + "flos": 14752822072320.0, + "grad_norm": 4.6819192163826555, + "language_loss": 0.66522568, + "learning_rate": 2.5580270944193467e-06, + "loss": 0.68728727, + "num_input_tokens_seen": 76606640, + "step": 3557, + "time_per_iteration": 2.419623851776123 + }, + { + "auxiliary_loss_clip": 0.01084349, + "auxiliary_loss_mlp": 0.01003471, + "balance_loss_clip": 1.02027535, + "balance_loss_mlp": 1.0024159, + "epoch": 0.4278242048938857, + "flos": 70654712601600.0, + "grad_norm": 0.7595784660188609, + "language_loss": 0.55505395, + "learning_rate": 2.557279015636827e-06, + "loss": 0.57593215, + "num_input_tokens_seen": 76667050, + "step": 3558, + "time_per_iteration": 3.0204498767852783 + }, + { + "auxiliary_loss_clip": 0.01070572, + "auxiliary_loss_mlp": 0.01003582, + "balance_loss_clip": 1.0200609, + "balance_loss_mlp": 1.00245595, + "epoch": 0.42794444778452473, + "flos": 69366165033600.0, + "grad_norm": 0.7784422243866626, + "language_loss": 0.61264527, + "learning_rate": 2.5565308523155245e-06, + "loss": 0.63338685, + "num_input_tokens_seen": 76726650, + "step": 3559, + "time_per_iteration": 2.9806787967681885 + }, + { + "auxiliary_loss_clip": 0.01123368, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.04993415, + "balance_loss_mlp": 1.02137637, + "epoch": 0.42806469067516384, + "flos": 18215328481920.0, + "grad_norm": 2.2434340829085153, + "language_loss": 0.81738192, + "learning_rate": 2.5557826045689336e-06, + "loss": 0.838911, + "num_input_tokens_seen": 76742890, + "step": 3560, + "time_per_iteration": 2.5365028381347656 + }, + { + "auxiliary_loss_clip": 0.01050918, + "auxiliary_loss_mlp": 0.01005069, + "balance_loss_clip": 1.02442741, + "balance_loss_mlp": 1.00351954, + "epoch": 0.4281849335658029, + "flos": 54535814432640.0, + "grad_norm": 0.818732295499881, + "language_loss": 0.58867776, + "learning_rate": 2.5550342725105643e-06, + "loss": 0.60923767, + "num_input_tokens_seen": 76801055, + "step": 3561, + "time_per_iteration": 3.0466389656066895 + }, + { + "auxiliary_loss_clip": 0.0117374, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.0586741, + "balance_loss_mlp": 1.02611387, + "epoch": 0.428305176456442, + "flos": 17274828723840.0, + "grad_norm": 1.9251193512602596, + "language_loss": 0.81000805, + "learning_rate": 2.554285856253937e-06, + "loss": 0.83208698, + "num_input_tokens_seen": 76819890, + "step": 3562, + "time_per_iteration": 3.329968214035034 + }, + { + "auxiliary_loss_clip": 0.01154903, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.05482984, + "balance_loss_mlp": 1.02243209, + "epoch": 0.4284254193470811, + "flos": 26359509749760.0, + "grad_norm": 2.103650819117505, + "language_loss": 0.77569038, + "learning_rate": 2.5535373559125855e-06, + "loss": 0.79754472, + "num_input_tokens_seen": 76840255, + "step": 3563, + "time_per_iteration": 2.5613696575164795 + }, + { + "auxiliary_loss_clip": 0.01102388, + "auxiliary_loss_mlp": 0.01024007, + "balance_loss_clip": 1.04678929, + "balance_loss_mlp": 1.01523948, + "epoch": 0.42854566223772017, + "flos": 29714248379520.0, + "grad_norm": 1.80926251509629, + "language_loss": 0.81813461, + "learning_rate": 2.552788771600057e-06, + "loss": 0.83939862, + "num_input_tokens_seen": 76860565, + "step": 3564, + "time_per_iteration": 2.8759994506835938 + }, + { + "auxiliary_loss_clip": 0.01146431, + "auxiliary_loss_mlp": 0.01032696, + "balance_loss_clip": 1.05467808, + "balance_loss_mlp": 1.02420259, + "epoch": 0.4286659051283593, + "flos": 22018161277440.0, + "grad_norm": 2.0687220530542243, + "language_loss": 0.82019293, + "learning_rate": 2.5520401034299118e-06, + "loss": 0.84198427, + "num_input_tokens_seen": 76878325, + "step": 3565, + "time_per_iteration": 2.961338758468628 + }, + { + "auxiliary_loss_clip": 0.01175253, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.05667877, + "balance_loss_mlp": 1.02370882, + "epoch": 0.4287861480189984, + "flos": 13334422838400.0, + "grad_norm": 2.2319618979316096, + "language_loss": 0.87335134, + "learning_rate": 2.551291351515722e-06, + "loss": 0.89542651, + "num_input_tokens_seen": 76895340, + "step": 3566, + "time_per_iteration": 3.603695869445801 + }, + { + "auxiliary_loss_clip": 0.01136179, + "auxiliary_loss_mlp": 0.00763665, + "balance_loss_clip": 1.04632497, + "balance_loss_mlp": 1.00047851, + "epoch": 0.42890639090963745, + "flos": 26651535321600.0, + "grad_norm": 1.6067015940367897, + "language_loss": 0.85390478, + "learning_rate": 2.5505425159710726e-06, + "loss": 0.87290323, + "num_input_tokens_seen": 76915150, + "step": 3567, + "time_per_iteration": 2.674638509750366 + }, + { + "auxiliary_loss_clip": 0.01163183, + "auxiliary_loss_mlp": 0.00763388, + "balance_loss_clip": 1.05157232, + "balance_loss_mlp": 1.00046396, + "epoch": 0.42902663380027656, + "flos": 24055768091520.0, + "grad_norm": 3.9916115142268445, + "language_loss": 0.83194745, + "learning_rate": 2.549793596909561e-06, + "loss": 0.85121316, + "num_input_tokens_seen": 76933770, + "step": 3568, + "time_per_iteration": 2.529055118560791 + }, + { + "auxiliary_loss_clip": 0.01154539, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.05505311, + "balance_loss_mlp": 1.02096212, + "epoch": 0.42914687669091567, + "flos": 15632561975040.0, + "grad_norm": 1.9891471433547236, + "language_loss": 0.65879661, + "learning_rate": 2.5490445944447976e-06, + "loss": 0.68063486, + "num_input_tokens_seen": 76952265, + "step": 3569, + "time_per_iteration": 3.33054518699646 + }, + { + "auxiliary_loss_clip": 0.01169898, + "auxiliary_loss_mlp": 0.01027862, + "balance_loss_clip": 1.05364561, + "balance_loss_mlp": 1.01987469, + "epoch": 0.4292671195815547, + "flos": 31467802440960.0, + "grad_norm": 2.4909204966518628, + "language_loss": 0.65166163, + "learning_rate": 2.548295508690406e-06, + "loss": 0.67363918, + "num_input_tokens_seen": 76973560, + "step": 3570, + "time_per_iteration": 3.2226977348327637 + }, + { + "auxiliary_loss_clip": 0.01173847, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.05355704, + "balance_loss_mlp": 1.01922894, + "epoch": 0.42938736247219383, + "flos": 30257756046720.0, + "grad_norm": 1.6986809529245595, + "language_loss": 0.76484245, + "learning_rate": 2.5475463397600217e-06, + "loss": 0.78685689, + "num_input_tokens_seen": 76993640, + "step": 3571, + "time_per_iteration": 2.5285048484802246 + }, + { + "auxiliary_loss_clip": 0.01191717, + "auxiliary_loss_mlp": 0.01029933, + "balance_loss_clip": 1.05914819, + "balance_loss_mlp": 1.02160597, + "epoch": 0.42950760536283294, + "flos": 29349683291520.0, + "grad_norm": 2.443079276247546, + "language_loss": 0.77560741, + "learning_rate": 2.546797087767293e-06, + "loss": 0.79782391, + "num_input_tokens_seen": 77013765, + "step": 3572, + "time_per_iteration": 2.487550735473633 + }, + { + "auxiliary_loss_clip": 0.01124443, + "auxiliary_loss_mlp": 0.01034076, + "balance_loss_clip": 1.04902542, + "balance_loss_mlp": 1.02599955, + "epoch": 0.429627848253472, + "flos": 26869943969280.0, + "grad_norm": 1.8032635849306458, + "language_loss": 0.87161362, + "learning_rate": 2.546047752825881e-06, + "loss": 0.89319885, + "num_input_tokens_seen": 77034370, + "step": 3573, + "time_per_iteration": 2.6012799739837646 + }, + { + "auxiliary_loss_clip": 0.01132383, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.05017209, + "balance_loss_mlp": 1.02296543, + "epoch": 0.4297480911441111, + "flos": 13881270470400.0, + "grad_norm": 2.242408211206307, + "language_loss": 0.93161988, + "learning_rate": 2.5452983350494595e-06, + "loss": 0.95325571, + "num_input_tokens_seen": 77049925, + "step": 3574, + "time_per_iteration": 2.532256603240967 + }, + { + "auxiliary_loss_clip": 0.01170554, + "auxiliary_loss_mlp": 0.00763107, + "balance_loss_clip": 1.05412567, + "balance_loss_mlp": 1.00056624, + "epoch": 0.4298683340347502, + "flos": 20741141975040.0, + "grad_norm": 2.1450186344560365, + "language_loss": 0.65653044, + "learning_rate": 2.544548834551713e-06, + "loss": 0.67586708, + "num_input_tokens_seen": 77068930, + "step": 3575, + "time_per_iteration": 2.500807762145996 + }, + { + "auxiliary_loss_clip": 0.01139115, + "auxiliary_loss_mlp": 0.00763196, + "balance_loss_clip": 1.05166554, + "balance_loss_mlp": 1.00045967, + "epoch": 0.4299885769253893, + "flos": 20882126856960.0, + "grad_norm": 2.4004906846527336, + "language_loss": 0.94127178, + "learning_rate": 2.5437992514463424e-06, + "loss": 0.9602949, + "num_input_tokens_seen": 77082255, + "step": 3576, + "time_per_iteration": 2.515387773513794 + }, + { + "auxiliary_loss_clip": 0.01174261, + "auxiliary_loss_mlp": 0.01028046, + "balance_loss_clip": 1.05637813, + "balance_loss_mlp": 1.01946259, + "epoch": 0.4301088198160284, + "flos": 25484618183040.0, + "grad_norm": 1.7281665072426735, + "language_loss": 0.87893593, + "learning_rate": 2.5430495858470565e-06, + "loss": 0.90095901, + "num_input_tokens_seen": 77101725, + "step": 3577, + "time_per_iteration": 2.4934825897216797 + }, + { + "auxiliary_loss_clip": 0.01169641, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.05549073, + "balance_loss_mlp": 1.0241704, + "epoch": 0.43022906270666744, + "flos": 18259427404800.0, + "grad_norm": 2.78896037698433, + "language_loss": 0.77113187, + "learning_rate": 2.54229983786758e-06, + "loss": 0.7931509, + "num_input_tokens_seen": 77119670, + "step": 3578, + "time_per_iteration": 2.425658702850342 + }, + { + "auxiliary_loss_clip": 0.01156303, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.05145121, + "balance_loss_mlp": 1.02094698, + "epoch": 0.43034930559730655, + "flos": 23399536567680.0, + "grad_norm": 1.8501987499236685, + "language_loss": 0.85085428, + "learning_rate": 2.541550007621651e-06, + "loss": 0.87271035, + "num_input_tokens_seen": 77138160, + "step": 3579, + "time_per_iteration": 2.4952118396759033 + }, + { + "auxiliary_loss_clip": 0.01171266, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.05745423, + "balance_loss_mlp": 1.02094483, + "epoch": 0.43046954848794566, + "flos": 28184382264960.0, + "grad_norm": 2.139920714655677, + "language_loss": 0.79701084, + "learning_rate": 2.5408000952230156e-06, + "loss": 0.81901133, + "num_input_tokens_seen": 77156950, + "step": 3580, + "time_per_iteration": 2.5249781608581543 + }, + { + "auxiliary_loss_clip": 0.01152759, + "auxiliary_loss_mlp": 0.01027642, + "balance_loss_clip": 1.05095124, + "balance_loss_mlp": 1.01902294, + "epoch": 0.4305897913785847, + "flos": 28580476515840.0, + "grad_norm": 2.0773310101012012, + "language_loss": 0.90524906, + "learning_rate": 2.5400501007854357e-06, + "loss": 0.92705309, + "num_input_tokens_seen": 77176395, + "step": 3581, + "time_per_iteration": 2.5997626781463623 + }, + { + "auxiliary_loss_clip": 0.01126468, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.04613614, + "balance_loss_mlp": 1.02560103, + "epoch": 0.43071003426922383, + "flos": 20448721353600.0, + "grad_norm": 1.761173417858187, + "language_loss": 0.75274009, + "learning_rate": 2.539300024422685e-06, + "loss": 0.77433741, + "num_input_tokens_seen": 77194340, + "step": 3582, + "time_per_iteration": 2.556225299835205 + }, + { + "auxiliary_loss_clip": 0.01050716, + "auxiliary_loss_mlp": 0.01003189, + "balance_loss_clip": 1.01982164, + "balance_loss_mlp": 1.00197935, + "epoch": 0.43083027715986294, + "flos": 51997969883520.0, + "grad_norm": 0.7906574745758691, + "language_loss": 0.60928702, + "learning_rate": 2.538549866248549e-06, + "loss": 0.62982607, + "num_input_tokens_seen": 77249320, + "step": 3583, + "time_per_iteration": 2.9549360275268555 + }, + { + "auxiliary_loss_clip": 0.01173343, + "auxiliary_loss_mlp": 0.0102745, + "balance_loss_clip": 1.05350137, + "balance_loss_mlp": 1.01874781, + "epoch": 0.430950520050502, + "flos": 16690885320960.0, + "grad_norm": 2.3415440895236714, + "language_loss": 0.80943739, + "learning_rate": 2.5377996263768274e-06, + "loss": 0.83144534, + "num_input_tokens_seen": 77267400, + "step": 3584, + "time_per_iteration": 2.446281671524048 + }, + { + "auxiliary_loss_clip": 0.01156413, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.04978085, + "balance_loss_mlp": 1.02570295, + "epoch": 0.4310707629411411, + "flos": 24608433726720.0, + "grad_norm": 1.7468793875230286, + "language_loss": 0.68138158, + "learning_rate": 2.5370493049213293e-06, + "loss": 0.70328331, + "num_input_tokens_seen": 77287045, + "step": 3585, + "time_per_iteration": 2.524519205093384 + }, + { + "auxiliary_loss_clip": 0.010918, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.04665244, + "balance_loss_mlp": 1.02145815, + "epoch": 0.4311910058317802, + "flos": 26432983019520.0, + "grad_norm": 2.0215579887339024, + "language_loss": 0.80402058, + "learning_rate": 2.536298901995878e-06, + "loss": 0.82524341, + "num_input_tokens_seen": 77306255, + "step": 3586, + "time_per_iteration": 2.703721046447754 + }, + { + "auxiliary_loss_clip": 0.01160498, + "auxiliary_loss_mlp": 0.01027067, + "balance_loss_clip": 1.05462074, + "balance_loss_mlp": 1.0187639, + "epoch": 0.43131124872241927, + "flos": 25155891889920.0, + "grad_norm": 1.614446675531778, + "language_loss": 0.80473924, + "learning_rate": 2.535548417714311e-06, + "loss": 0.82661486, + "num_input_tokens_seen": 77325555, + "step": 3587, + "time_per_iteration": 2.740952730178833 + }, + { + "auxiliary_loss_clip": 0.01178517, + "auxiliary_loss_mlp": 0.01029529, + "balance_loss_clip": 1.05543554, + "balance_loss_mlp": 1.02067208, + "epoch": 0.4314314916130584, + "flos": 21614812479360.0, + "grad_norm": 1.533642986622583, + "language_loss": 0.86980295, + "learning_rate": 2.534797852190474e-06, + "loss": 0.89188343, + "num_input_tokens_seen": 77345735, + "step": 3588, + "time_per_iteration": 3.4060065746307373 + }, + { + "auxiliary_loss_clip": 0.01169581, + "auxiliary_loss_mlp": 0.01037813, + "balance_loss_clip": 1.05319691, + "balance_loss_mlp": 1.02904487, + "epoch": 0.4315517345036975, + "flos": 19275016544640.0, + "grad_norm": 2.009137048748634, + "language_loss": 0.81502658, + "learning_rate": 2.5340472055382283e-06, + "loss": 0.83710051, + "num_input_tokens_seen": 77361765, + "step": 3589, + "time_per_iteration": 2.4752395153045654 + }, + { + "auxiliary_loss_clip": 0.01139034, + "auxiliary_loss_mlp": 0.01026132, + "balance_loss_clip": 1.04677331, + "balance_loss_mlp": 1.01827312, + "epoch": 0.43167197739433655, + "flos": 24273853516800.0, + "grad_norm": 2.8294804541610383, + "language_loss": 0.81204522, + "learning_rate": 2.5332964778714468e-06, + "loss": 0.83369684, + "num_input_tokens_seen": 77378950, + "step": 3590, + "time_per_iteration": 2.5702152252197266 + }, + { + "auxiliary_loss_clip": 0.0114217, + "auxiliary_loss_mlp": 0.01026267, + "balance_loss_clip": 1.05348587, + "balance_loss_mlp": 1.0186739, + "epoch": 0.43179222028497566, + "flos": 16867816738560.0, + "grad_norm": 1.5957164668142962, + "language_loss": 0.66312826, + "learning_rate": 2.5325456693040123e-06, + "loss": 0.68481266, + "num_input_tokens_seen": 77396145, + "step": 3591, + "time_per_iteration": 2.5571916103363037 + }, + { + "auxiliary_loss_clip": 0.01179218, + "auxiliary_loss_mlp": 0.01027707, + "balance_loss_clip": 1.05383968, + "balance_loss_mlp": 1.01914167, + "epoch": 0.43191246317561477, + "flos": 17639214243840.0, + "grad_norm": 2.2300435099828704, + "language_loss": 0.74680722, + "learning_rate": 2.531794779949824e-06, + "loss": 0.76887643, + "num_input_tokens_seen": 77414045, + "step": 3592, + "time_per_iteration": 3.300826072692871 + }, + { + "auxiliary_loss_clip": 0.01134499, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.04850733, + "balance_loss_mlp": 1.01959825, + "epoch": 0.4320327060662538, + "flos": 23878800760320.0, + "grad_norm": 1.6716766956956526, + "language_loss": 0.87860763, + "learning_rate": 2.5310438099227903e-06, + "loss": 0.90022922, + "num_input_tokens_seen": 77431310, + "step": 3593, + "time_per_iteration": 2.5510101318359375 + }, + { + "auxiliary_loss_clip": 0.01072462, + "auxiliary_loss_mlp": 0.01001891, + "balance_loss_clip": 1.01909506, + "balance_loss_mlp": 1.00081182, + "epoch": 0.43215294895689293, + "flos": 66394917959040.0, + "grad_norm": 0.9191302928058255, + "language_loss": 0.53381926, + "learning_rate": 2.530292759336833e-06, + "loss": 0.55456281, + "num_input_tokens_seen": 77492045, + "step": 3594, + "time_per_iteration": 3.1054184436798096 + }, + { + "auxiliary_loss_clip": 0.01157345, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.05487216, + "balance_loss_mlp": 1.01938283, + "epoch": 0.432273191847532, + "flos": 20594267262720.0, + "grad_norm": 2.3427555431985314, + "language_loss": 0.69422531, + "learning_rate": 2.5295416283058855e-06, + "loss": 0.71608031, + "num_input_tokens_seen": 77510910, + "step": 3595, + "time_per_iteration": 2.497631311416626 + }, + { + "auxiliary_loss_clip": 0.01153369, + "auxiliary_loss_mlp": 0.00762782, + "balance_loss_clip": 1.05268729, + "balance_loss_mlp": 1.00050402, + "epoch": 0.4323934347381711, + "flos": 19282127437440.0, + "grad_norm": 1.5675448187427266, + "language_loss": 0.65933645, + "learning_rate": 2.5287904169438943e-06, + "loss": 0.67849797, + "num_input_tokens_seen": 77530115, + "step": 3596, + "time_per_iteration": 4.03898024559021 + }, + { + "auxiliary_loss_clip": 0.01110511, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_clip": 1.05006278, + "balance_loss_mlp": 1.02889407, + "epoch": 0.4325136776288102, + "flos": 21726315273600.0, + "grad_norm": 3.051767216861184, + "language_loss": 0.64282525, + "learning_rate": 2.528039125364817e-06, + "loss": 0.66431338, + "num_input_tokens_seen": 77548920, + "step": 3597, + "time_per_iteration": 2.6449201107025146 + }, + { + "auxiliary_loss_clip": 0.01145312, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.05067301, + "balance_loss_mlp": 1.02229559, + "epoch": 0.43263392051944927, + "flos": 22340746344960.0, + "grad_norm": 2.246116252924554, + "language_loss": 0.75853992, + "learning_rate": 2.5272877536826246e-06, + "loss": 0.78030181, + "num_input_tokens_seen": 77567715, + "step": 3598, + "time_per_iteration": 2.5189919471740723 + }, + { + "auxiliary_loss_clip": 0.0113151, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.04659212, + "balance_loss_mlp": 1.02204812, + "epoch": 0.4327541634100884, + "flos": 29168406328320.0, + "grad_norm": 2.3783946157873723, + "language_loss": 0.70116448, + "learning_rate": 2.5265363020112986e-06, + "loss": 0.72278833, + "num_input_tokens_seen": 77588035, + "step": 3599, + "time_per_iteration": 2.6355364322662354 + }, + { + "auxiliary_loss_clip": 0.01172324, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.05613291, + "balance_loss_mlp": 1.02587986, + "epoch": 0.4328744063007275, + "flos": 26067448264320.0, + "grad_norm": 3.323915847623867, + "language_loss": 0.83648968, + "learning_rate": 2.5257847704648344e-06, + "loss": 0.85856009, + "num_input_tokens_seen": 77609265, + "step": 3600, + "time_per_iteration": 2.5086514949798584 + }, + { + "auxiliary_loss_clip": 0.01184066, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.05506432, + "balance_loss_mlp": 1.02017796, + "epoch": 0.43299464919136654, + "flos": 16581357774720.0, + "grad_norm": 1.9252807626380084, + "language_loss": 0.75428838, + "learning_rate": 2.525033159157239e-06, + "loss": 0.77640957, + "num_input_tokens_seen": 77625580, + "step": 3601, + "time_per_iteration": 2.3959898948669434 + }, + { + "auxiliary_loss_clip": 0.01169547, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.05401242, + "balance_loss_mlp": 1.02986491, + "epoch": 0.43311489208200565, + "flos": 16107265140480.0, + "grad_norm": 2.006134952642327, + "language_loss": 0.77326339, + "learning_rate": 2.52428146820253e-06, + "loss": 0.79535079, + "num_input_tokens_seen": 77643835, + "step": 3602, + "time_per_iteration": 2.4670159816741943 + }, + { + "auxiliary_loss_clip": 0.01146354, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.05238581, + "balance_loss_mlp": 1.02189553, + "epoch": 0.43323513497264476, + "flos": 22930220442240.0, + "grad_norm": 1.7307430856872235, + "language_loss": 0.81849921, + "learning_rate": 2.52352969771474e-06, + "loss": 0.84027946, + "num_input_tokens_seen": 77663060, + "step": 3603, + "time_per_iteration": 2.5484559535980225 + }, + { + "auxiliary_loss_clip": 0.01159581, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.05276, + "balance_loss_mlp": 1.02141619, + "epoch": 0.4333553778632838, + "flos": 25299031587840.0, + "grad_norm": 2.356006697098909, + "language_loss": 0.88193458, + "learning_rate": 2.5227778478079106e-06, + "loss": 0.90382659, + "num_input_tokens_seen": 77682470, + "step": 3604, + "time_per_iteration": 2.5340633392333984 + }, + { + "auxiliary_loss_clip": 0.01167145, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.05174792, + "balance_loss_mlp": 1.02540648, + "epoch": 0.43347562075392293, + "flos": 19387165783680.0, + "grad_norm": 1.5783330184256474, + "language_loss": 0.76814699, + "learning_rate": 2.522025918596098e-06, + "loss": 0.79015219, + "num_input_tokens_seen": 77700770, + "step": 3605, + "time_per_iteration": 2.449148654937744 + }, + { + "auxiliary_loss_clip": 0.01173961, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.05479336, + "balance_loss_mlp": 1.02148247, + "epoch": 0.43359586364456204, + "flos": 26325969425280.0, + "grad_norm": 2.181342040543769, + "language_loss": 0.65379238, + "learning_rate": 2.521273910193368e-06, + "loss": 0.67582834, + "num_input_tokens_seen": 77723950, + "step": 3606, + "time_per_iteration": 2.523129940032959 + }, + { + "auxiliary_loss_clip": 0.01178969, + "auxiliary_loss_mlp": 0.01027876, + "balance_loss_clip": 1.05606961, + "balance_loss_mlp": 1.01939464, + "epoch": 0.4337161065352011, + "flos": 15989261984640.0, + "grad_norm": 2.609366644306554, + "language_loss": 0.87145531, + "learning_rate": 2.5205218227138006e-06, + "loss": 0.89352375, + "num_input_tokens_seen": 77736905, + "step": 3607, + "time_per_iteration": 2.421447277069092 + }, + { + "auxiliary_loss_clip": 0.0118613, + "auxiliary_loss_mlp": 0.01027583, + "balance_loss_clip": 1.05574131, + "balance_loss_mlp": 1.01929164, + "epoch": 0.4338363494258402, + "flos": 20224710184320.0, + "grad_norm": 2.258393103108662, + "language_loss": 0.79083574, + "learning_rate": 2.519769656271486e-06, + "loss": 0.81297284, + "num_input_tokens_seen": 77754325, + "step": 3608, + "time_per_iteration": 2.4304187297821045 + }, + { + "auxiliary_loss_clip": 0.01120083, + "auxiliary_loss_mlp": 0.01031254, + "balance_loss_clip": 1.0486685, + "balance_loss_mlp": 1.02252769, + "epoch": 0.43395659231647926, + "flos": 20083904870400.0, + "grad_norm": 2.317198853509111, + "language_loss": 0.67356873, + "learning_rate": 2.5190174109805285e-06, + "loss": 0.69508207, + "num_input_tokens_seen": 77774150, + "step": 3609, + "time_per_iteration": 2.546675443649292 + }, + { + "auxiliary_loss_clip": 0.01147647, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.04997301, + "balance_loss_mlp": 1.01796293, + "epoch": 0.43407683520711837, + "flos": 19901801894400.0, + "grad_norm": 2.10948214248333, + "language_loss": 0.64168036, + "learning_rate": 2.518265086955042e-06, + "loss": 0.66342616, + "num_input_tokens_seen": 77791870, + "step": 3610, + "time_per_iteration": 2.5092434883117676 + }, + { + "auxiliary_loss_clip": 0.01185956, + "auxiliary_loss_mlp": 0.01037496, + "balance_loss_clip": 1.05473149, + "balance_loss_mlp": 1.02936292, + "epoch": 0.4341970780977575, + "flos": 23108732058240.0, + "grad_norm": 1.6843623205361662, + "language_loss": 0.83678395, + "learning_rate": 2.5175126843091534e-06, + "loss": 0.85901845, + "num_input_tokens_seen": 77811240, + "step": 3611, + "time_per_iteration": 2.4476075172424316 + }, + { + "auxiliary_loss_clip": 0.01158178, + "auxiliary_loss_mlp": 0.01023769, + "balance_loss_clip": 1.05048442, + "balance_loss_mlp": 1.01560867, + "epoch": 0.43431732098839654, + "flos": 37408288406400.0, + "grad_norm": 2.1042005140210893, + "language_loss": 0.75145274, + "learning_rate": 2.5167602031570034e-06, + "loss": 0.77327222, + "num_input_tokens_seen": 77831425, + "step": 3612, + "time_per_iteration": 2.617098093032837 + }, + { + "auxiliary_loss_clip": 0.0118606, + "auxiliary_loss_mlp": 0.01025393, + "balance_loss_clip": 1.05638385, + "balance_loss_mlp": 1.01708436, + "epoch": 0.43443756387903565, + "flos": 31868206323840.0, + "grad_norm": 1.6016382545762038, + "language_loss": 0.73373705, + "learning_rate": 2.51600764361274e-06, + "loss": 0.75585163, + "num_input_tokens_seen": 77852950, + "step": 3613, + "time_per_iteration": 2.501828908920288 + }, + { + "auxiliary_loss_clip": 0.01188277, + "auxiliary_loss_mlp": 0.01026601, + "balance_loss_clip": 1.05649614, + "balance_loss_mlp": 1.01847661, + "epoch": 0.43455780676967476, + "flos": 23477139901440.0, + "grad_norm": 2.1886245207445425, + "language_loss": 0.78908598, + "learning_rate": 2.5152550057905283e-06, + "loss": 0.81123483, + "num_input_tokens_seen": 77872840, + "step": 3614, + "time_per_iteration": 2.4389216899871826 + }, + { + "auxiliary_loss_clip": 0.01174111, + "auxiliary_loss_mlp": 0.0076366, + "balance_loss_clip": 1.05603015, + "balance_loss_mlp": 1.00063324, + "epoch": 0.4346780496603138, + "flos": 24207060176640.0, + "grad_norm": 2.2492235974098054, + "language_loss": 0.76705825, + "learning_rate": 2.5145022898045415e-06, + "loss": 0.78643596, + "num_input_tokens_seen": 77892025, + "step": 3615, + "time_per_iteration": 3.308350086212158 + }, + { + "auxiliary_loss_clip": 0.01159266, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.04948032, + "balance_loss_mlp": 1.02433205, + "epoch": 0.4347982925509529, + "flos": 17092366611840.0, + "grad_norm": 2.026332633776362, + "language_loss": 0.89337057, + "learning_rate": 2.5137494957689664e-06, + "loss": 0.91529644, + "num_input_tokens_seen": 77907635, + "step": 3616, + "time_per_iteration": 2.4904677867889404 + }, + { + "auxiliary_loss_clip": 0.01061576, + "auxiliary_loss_mlp": 0.01005692, + "balance_loss_clip": 1.01835418, + "balance_loss_mlp": 1.00462496, + "epoch": 0.43491853544159204, + "flos": 60945544696320.0, + "grad_norm": 0.7613973989573046, + "language_loss": 0.5735485, + "learning_rate": 2.5129966237980016e-06, + "loss": 0.59422117, + "num_input_tokens_seen": 77970630, + "step": 3617, + "time_per_iteration": 3.0938665866851807 + }, + { + "auxiliary_loss_clip": 0.01142707, + "auxiliary_loss_mlp": 0.01025509, + "balance_loss_clip": 1.04907811, + "balance_loss_mlp": 1.017272, + "epoch": 0.4350387783322311, + "flos": 21944652094080.0, + "grad_norm": 1.8988439242893407, + "language_loss": 0.77993286, + "learning_rate": 2.512243674005857e-06, + "loss": 0.80161512, + "num_input_tokens_seen": 77989995, + "step": 3618, + "time_per_iteration": 2.55068302154541 + }, + { + "auxiliary_loss_clip": 0.01113454, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.048311, + "balance_loss_mlp": 1.02201319, + "epoch": 0.4351590212228702, + "flos": 25082705928960.0, + "grad_norm": 1.7363392803079163, + "language_loss": 0.85896677, + "learning_rate": 2.5114906465067537e-06, + "loss": 0.88040829, + "num_input_tokens_seen": 78010980, + "step": 3619, + "time_per_iteration": 3.4609029293060303 + }, + { + "auxiliary_loss_clip": 0.01172403, + "auxiliary_loss_mlp": 0.01026266, + "balance_loss_clip": 1.05167282, + "balance_loss_mlp": 1.01766515, + "epoch": 0.4352792641135093, + "flos": 21506541909120.0, + "grad_norm": 1.949125982000857, + "language_loss": 0.75005817, + "learning_rate": 2.5107375414149264e-06, + "loss": 0.77204478, + "num_input_tokens_seen": 78030225, + "step": 3620, + "time_per_iteration": 2.509326457977295 + }, + { + "auxiliary_loss_clip": 0.01118833, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.04273868, + "balance_loss_mlp": 1.02034283, + "epoch": 0.43539950700414837, + "flos": 16253457494400.0, + "grad_norm": 2.311843330396648, + "language_loss": 0.71927226, + "learning_rate": 2.5099843588446197e-06, + "loss": 0.74075323, + "num_input_tokens_seen": 78048545, + "step": 3621, + "time_per_iteration": 2.5363385677337646 + }, + { + "auxiliary_loss_clip": 0.01138254, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.05215633, + "balance_loss_mlp": 1.02281761, + "epoch": 0.4355197498947875, + "flos": 16691819074560.0, + "grad_norm": 1.6071659894416652, + "language_loss": 0.61504698, + "learning_rate": 2.509231098910091e-06, + "loss": 0.63674462, + "num_input_tokens_seen": 78068415, + "step": 3622, + "time_per_iteration": 3.354856252670288 + }, + { + "auxiliary_loss_clip": 0.01155929, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.0559597, + "balance_loss_mlp": 1.02075517, + "epoch": 0.4356399927854266, + "flos": 16362733645440.0, + "grad_norm": 2.2405377597638565, + "language_loss": 0.7461971, + "learning_rate": 2.508477761725611e-06, + "loss": 0.76805007, + "num_input_tokens_seen": 78086690, + "step": 3623, + "time_per_iteration": 3.242828607559204 + }, + { + "auxiliary_loss_clip": 0.01176179, + "auxiliary_loss_mlp": 0.01029671, + "balance_loss_clip": 1.05535257, + "balance_loss_mlp": 1.02109432, + "epoch": 0.43576023567606564, + "flos": 17202037812480.0, + "grad_norm": 2.402791545855353, + "language_loss": 0.80649513, + "learning_rate": 2.507724347405458e-06, + "loss": 0.82855362, + "num_input_tokens_seen": 78104640, + "step": 3624, + "time_per_iteration": 2.445854663848877 + }, + { + "auxiliary_loss_clip": 0.01120554, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.04393983, + "balance_loss_mlp": 1.02121329, + "epoch": 0.43588047856670475, + "flos": 15917656222080.0, + "grad_norm": 2.233436473490909, + "language_loss": 0.82006812, + "learning_rate": 2.5069708560639243e-06, + "loss": 0.84156996, + "num_input_tokens_seen": 78122550, + "step": 3625, + "time_per_iteration": 2.5374538898468018 + }, + { + "auxiliary_loss_clip": 0.01146066, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.05117929, + "balance_loss_mlp": 1.02008963, + "epoch": 0.4360007214573438, + "flos": 23659566099840.0, + "grad_norm": 2.705413983559013, + "language_loss": 0.61605096, + "learning_rate": 2.5062172878153158e-06, + "loss": 0.6378001, + "num_input_tokens_seen": 78141825, + "step": 3626, + "time_per_iteration": 2.5436885356903076 + }, + { + "auxiliary_loss_clip": 0.01124464, + "auxiliary_loss_mlp": 0.01032089, + "balance_loss_clip": 1.05002499, + "balance_loss_mlp": 1.02250445, + "epoch": 0.4361209643479829, + "flos": 21978767036160.0, + "grad_norm": 1.9465188996827472, + "language_loss": 0.87297785, + "learning_rate": 2.505463642773947e-06, + "loss": 0.89454341, + "num_input_tokens_seen": 78161790, + "step": 3627, + "time_per_iteration": 2.6227526664733887 + }, + { + "auxiliary_loss_clip": 0.01144854, + "auxiliary_loss_mlp": 0.00763519, + "balance_loss_clip": 1.05143547, + "balance_loss_mlp": 1.00067449, + "epoch": 0.43624120723862203, + "flos": 17420159151360.0, + "grad_norm": 2.514219648593826, + "language_loss": 0.75091147, + "learning_rate": 2.504709921054146e-06, + "loss": 0.76999521, + "num_input_tokens_seen": 78178605, + "step": 3628, + "time_per_iteration": 2.509385824203491 + }, + { + "auxiliary_loss_clip": 0.01139182, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.04682803, + "balance_loss_mlp": 1.02279472, + "epoch": 0.4363614501292611, + "flos": 17895293280000.0, + "grad_norm": 4.513723805289859, + "language_loss": 0.83737111, + "learning_rate": 2.50395612277025e-06, + "loss": 0.85908252, + "num_input_tokens_seen": 78194460, + "step": 3629, + "time_per_iteration": 2.5140767097473145 + }, + { + "auxiliary_loss_clip": 0.01160723, + "auxiliary_loss_mlp": 0.01028449, + "balance_loss_clip": 1.0512743, + "balance_loss_mlp": 1.02008033, + "epoch": 0.4364816930199002, + "flos": 20302888135680.0, + "grad_norm": 2.1144942238942255, + "language_loss": 0.72881603, + "learning_rate": 2.503202248036612e-06, + "loss": 0.75070775, + "num_input_tokens_seen": 78213315, + "step": 3630, + "time_per_iteration": 2.5045201778411865 + }, + { + "auxiliary_loss_clip": 0.0118435, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.0543474, + "balance_loss_mlp": 1.02489638, + "epoch": 0.4366019359105393, + "flos": 24061334699520.0, + "grad_norm": 1.7372950643847136, + "language_loss": 0.73373204, + "learning_rate": 2.5024482969675927e-06, + "loss": 0.75591266, + "num_input_tokens_seen": 78233270, + "step": 3631, + "time_per_iteration": 2.45857310295105 + }, + { + "auxiliary_loss_clip": 0.01133253, + "auxiliary_loss_mlp": 0.01025508, + "balance_loss_clip": 1.04852903, + "balance_loss_mlp": 1.01764035, + "epoch": 0.43672217880117836, + "flos": 21754109422080.0, + "grad_norm": 2.022253891462564, + "language_loss": 0.84468782, + "learning_rate": 2.501694269677566e-06, + "loss": 0.86627543, + "num_input_tokens_seen": 78251040, + "step": 3632, + "time_per_iteration": 2.5529282093048096 + }, + { + "auxiliary_loss_clip": 0.01176267, + "auxiliary_loss_mlp": 0.01026618, + "balance_loss_clip": 1.05301142, + "balance_loss_mlp": 1.01802921, + "epoch": 0.4368424216918175, + "flos": 18035200753920.0, + "grad_norm": 1.9302069716994963, + "language_loss": 0.80720055, + "learning_rate": 2.500940166280918e-06, + "loss": 0.82922935, + "num_input_tokens_seen": 78269470, + "step": 3633, + "time_per_iteration": 2.469496011734009 + }, + { + "auxiliary_loss_clip": 0.01167428, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.05148768, + "balance_loss_mlp": 1.0229255, + "epoch": 0.4369626645824566, + "flos": 25447127362560.0, + "grad_norm": 1.9898720563025336, + "language_loss": 0.79129708, + "learning_rate": 2.500185986892045e-06, + "loss": 0.81328511, + "num_input_tokens_seen": 78288955, + "step": 3634, + "time_per_iteration": 2.5015029907226562 + }, + { + "auxiliary_loss_clip": 0.0116851, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.05206907, + "balance_loss_mlp": 1.02424169, + "epoch": 0.43708290747309564, + "flos": 25302694775040.0, + "grad_norm": 2.160090104061873, + "language_loss": 0.77396762, + "learning_rate": 2.499431731625355e-06, + "loss": 0.79598045, + "num_input_tokens_seen": 78307980, + "step": 3635, + "time_per_iteration": 2.495213031768799 + }, + { + "auxiliary_loss_clip": 0.011879, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.05529296, + "balance_loss_mlp": 1.01998973, + "epoch": 0.43720315036373475, + "flos": 31575103344000.0, + "grad_norm": 1.8733933394012074, + "language_loss": 0.79531556, + "learning_rate": 2.4986774005952686e-06, + "loss": 0.81748867, + "num_input_tokens_seen": 78330355, + "step": 3636, + "time_per_iteration": 2.529128074645996 + }, + { + "auxiliary_loss_clip": 0.01171313, + "auxiliary_loss_mlp": 0.01029927, + "balance_loss_clip": 1.05647755, + "balance_loss_mlp": 1.02175498, + "epoch": 0.43732339325437386, + "flos": 23112000195840.0, + "grad_norm": 1.9465508311630513, + "language_loss": 0.84558129, + "learning_rate": 2.4979229939162166e-06, + "loss": 0.86759365, + "num_input_tokens_seen": 78349135, + "step": 3637, + "time_per_iteration": 2.4494831562042236 + }, + { + "auxiliary_loss_clip": 0.01168537, + "auxiliary_loss_mlp": 0.01023937, + "balance_loss_clip": 1.05422688, + "balance_loss_mlp": 1.01566434, + "epoch": 0.4374436361450129, + "flos": 27746272080000.0, + "grad_norm": 1.6012624121032137, + "language_loss": 0.80400336, + "learning_rate": 2.4971685117026433e-06, + "loss": 0.82592809, + "num_input_tokens_seen": 78368900, + "step": 3638, + "time_per_iteration": 2.498469114303589 + }, + { + "auxiliary_loss_clip": 0.01172505, + "auxiliary_loss_mlp": 0.01026223, + "balance_loss_clip": 1.05378866, + "balance_loss_mlp": 1.01771784, + "epoch": 0.437563879035652, + "flos": 24172370616960.0, + "grad_norm": 1.467348615614611, + "language_loss": 0.76309681, + "learning_rate": 2.4964139540690018e-06, + "loss": 0.78508413, + "num_input_tokens_seen": 78392235, + "step": 3639, + "time_per_iteration": 2.51043701171875 + }, + { + "auxiliary_loss_clip": 0.01145479, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.0515523, + "balance_loss_mlp": 1.02043033, + "epoch": 0.4376841219262911, + "flos": 23477211728640.0, + "grad_norm": 2.1019965662682916, + "language_loss": 0.72508317, + "learning_rate": 2.495659321129758e-06, + "loss": 0.74683046, + "num_input_tokens_seen": 78409980, + "step": 3640, + "time_per_iteration": 2.566718578338623 + }, + { + "auxiliary_loss_clip": 0.01166287, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.04983711, + "balance_loss_mlp": 1.02836776, + "epoch": 0.4378043648169302, + "flos": 25447809720960.0, + "grad_norm": 1.7008280931151147, + "language_loss": 0.75023711, + "learning_rate": 2.494904612999389e-06, + "loss": 0.77226526, + "num_input_tokens_seen": 78428690, + "step": 3641, + "time_per_iteration": 2.5219779014587402 + }, + { + "auxiliary_loss_clip": 0.01067198, + "auxiliary_loss_mlp": 0.01003329, + "balance_loss_clip": 1.0167594, + "balance_loss_mlp": 1.00212479, + "epoch": 0.4379246077075693, + "flos": 53914056986880.0, + "grad_norm": 0.748602284354078, + "language_loss": 0.56544054, + "learning_rate": 2.4941498297923843e-06, + "loss": 0.58614576, + "num_input_tokens_seen": 78489260, + "step": 3642, + "time_per_iteration": 3.7832112312316895 + }, + { + "auxiliary_loss_clip": 0.0117018, + "auxiliary_loss_mlp": 0.01025861, + "balance_loss_clip": 1.05351484, + "balance_loss_mlp": 1.01771307, + "epoch": 0.43804485059820836, + "flos": 20588305605120.0, + "grad_norm": 1.9664160691605947, + "language_loss": 0.6983552, + "learning_rate": 2.4933949716232424e-06, + "loss": 0.72031558, + "num_input_tokens_seen": 78506785, + "step": 3643, + "time_per_iteration": 2.4410908222198486 + }, + { + "auxiliary_loss_clip": 0.01142508, + "auxiliary_loss_mlp": 0.0103006, + "balance_loss_clip": 1.05175757, + "balance_loss_mlp": 1.02110696, + "epoch": 0.43816509348884747, + "flos": 23876214981120.0, + "grad_norm": 2.329121432420207, + "language_loss": 0.73704511, + "learning_rate": 2.492640038606476e-06, + "loss": 0.75877076, + "num_input_tokens_seen": 78525150, + "step": 3644, + "time_per_iteration": 2.539618730545044 + }, + { + "auxiliary_loss_clip": 0.0117145, + "auxiliary_loss_mlp": 0.01030598, + "balance_loss_clip": 1.05210304, + "balance_loss_mlp": 1.02188373, + "epoch": 0.4382853363794866, + "flos": 14684448533760.0, + "grad_norm": 1.9267487199619864, + "language_loss": 0.78487146, + "learning_rate": 2.491885030856608e-06, + "loss": 0.80689198, + "num_input_tokens_seen": 78543245, + "step": 3645, + "time_per_iteration": 2.428431272506714 + }, + { + "auxiliary_loss_clip": 0.01160944, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.05413651, + "balance_loss_mlp": 1.02057326, + "epoch": 0.43840557927012563, + "flos": 17165301177600.0, + "grad_norm": 2.085209038771853, + "language_loss": 0.82645285, + "learning_rate": 2.4911299484881713e-06, + "loss": 0.84834945, + "num_input_tokens_seen": 78560775, + "step": 3646, + "time_per_iteration": 3.336498498916626 + }, + { + "auxiliary_loss_clip": 0.01150848, + "auxiliary_loss_mlp": 0.01025502, + "balance_loss_clip": 1.04993951, + "balance_loss_mlp": 1.01747894, + "epoch": 0.43852582216076474, + "flos": 19390685316480.0, + "grad_norm": 5.4379245278726245, + "language_loss": 0.81367928, + "learning_rate": 2.490374791615712e-06, + "loss": 0.83544278, + "num_input_tokens_seen": 78580800, + "step": 3647, + "time_per_iteration": 2.5091891288757324 + }, + { + "auxiliary_loss_clip": 0.0119299, + "auxiliary_loss_mlp": 0.00763706, + "balance_loss_clip": 1.05729151, + "balance_loss_mlp": 1.00067329, + "epoch": 0.43864606505140386, + "flos": 18075133699200.0, + "grad_norm": 2.7574399281404096, + "language_loss": 0.77927428, + "learning_rate": 2.4896195603537867e-06, + "loss": 0.79884124, + "num_input_tokens_seen": 78595410, + "step": 3648, + "time_per_iteration": 2.397385358810425 + }, + { + "auxiliary_loss_clip": 0.01125462, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.05315149, + "balance_loss_mlp": 1.02045488, + "epoch": 0.4387663079420429, + "flos": 19644896845440.0, + "grad_norm": 2.0292940268414967, + "language_loss": 0.73760599, + "learning_rate": 2.488864254816964e-06, + "loss": 0.75914979, + "num_input_tokens_seen": 78614100, + "step": 3649, + "time_per_iteration": 3.2876951694488525 + }, + { + "auxiliary_loss_clip": 0.01174093, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.05545819, + "balance_loss_mlp": 1.02809286, + "epoch": 0.438886550832682, + "flos": 19719339782400.0, + "grad_norm": 6.459611713590826, + "language_loss": 0.6875475, + "learning_rate": 2.4881088751198218e-06, + "loss": 0.70965803, + "num_input_tokens_seen": 78632260, + "step": 3650, + "time_per_iteration": 3.2314679622650146 + }, + { + "auxiliary_loss_clip": 0.01160188, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.05129576, + "balance_loss_mlp": 1.01964176, + "epoch": 0.43900679372332113, + "flos": 14536675981440.0, + "grad_norm": 2.725906032242637, + "language_loss": 0.64348853, + "learning_rate": 2.4873534213769517e-06, + "loss": 0.66537309, + "num_input_tokens_seen": 78647490, + "step": 3651, + "time_per_iteration": 2.4474642276763916 + }, + { + "auxiliary_loss_clip": 0.01139304, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.05315351, + "balance_loss_mlp": 1.02360094, + "epoch": 0.4391270366139602, + "flos": 24056234968320.0, + "grad_norm": 1.6682082602519543, + "language_loss": 0.71810389, + "learning_rate": 2.4865978937029547e-06, + "loss": 0.73981506, + "num_input_tokens_seen": 78666470, + "step": 3652, + "time_per_iteration": 2.5351741313934326 + }, + { + "auxiliary_loss_clip": 0.01119484, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.04930758, + "balance_loss_mlp": 1.0236547, + "epoch": 0.4392472795045993, + "flos": 31538510363520.0, + "grad_norm": 1.5912349278894915, + "language_loss": 0.6605354, + "learning_rate": 2.485842292212445e-06, + "loss": 0.68205631, + "num_input_tokens_seen": 78687685, + "step": 3653, + "time_per_iteration": 2.638922929763794 + }, + { + "auxiliary_loss_clip": 0.011882, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.05671668, + "balance_loss_mlp": 1.02149725, + "epoch": 0.4393675223952384, + "flos": 14866300114560.0, + "grad_norm": 1.9302416867016723, + "language_loss": 0.80380476, + "learning_rate": 2.485086617020045e-06, + "loss": 0.82598734, + "num_input_tokens_seen": 78706180, + "step": 3654, + "time_per_iteration": 2.4113941192626953 + }, + { + "auxiliary_loss_clip": 0.01149401, + "auxiliary_loss_mlp": 0.010245, + "balance_loss_clip": 1.04959059, + "balance_loss_mlp": 1.01587486, + "epoch": 0.43948776528587746, + "flos": 14825900292480.0, + "grad_norm": 2.1060905431072774, + "language_loss": 0.81650442, + "learning_rate": 2.4843308682403903e-06, + "loss": 0.83824337, + "num_input_tokens_seen": 78723095, + "step": 3655, + "time_per_iteration": 2.4563703536987305 + }, + { + "auxiliary_loss_clip": 0.01185694, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.05482495, + "balance_loss_mlp": 1.02050447, + "epoch": 0.4396080081765166, + "flos": 13914523486080.0, + "grad_norm": 1.771312483835851, + "language_loss": 0.82842958, + "learning_rate": 2.4835750459881294e-06, + "loss": 0.85057271, + "num_input_tokens_seen": 78739720, + "step": 3656, + "time_per_iteration": 2.3916516304016113 + }, + { + "auxiliary_loss_clip": 0.01149079, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.04897952, + "balance_loss_mlp": 1.0275346, + "epoch": 0.43972825106715563, + "flos": 18222978078720.0, + "grad_norm": 1.883646452895321, + "language_loss": 0.81660694, + "learning_rate": 2.4828191503779177e-06, + "loss": 0.83846641, + "num_input_tokens_seen": 78757820, + "step": 3657, + "time_per_iteration": 2.466453790664673 + }, + { + "auxiliary_loss_clip": 0.01142291, + "auxiliary_loss_mlp": 0.01026673, + "balance_loss_clip": 1.04992008, + "balance_loss_mlp": 1.01807773, + "epoch": 0.43984849395779474, + "flos": 16873239692160.0, + "grad_norm": 2.1980635967594306, + "language_loss": 0.89555985, + "learning_rate": 2.482063181524425e-06, + "loss": 0.9172495, + "num_input_tokens_seen": 78773720, + "step": 3658, + "time_per_iteration": 2.5018508434295654 + }, + { + "auxiliary_loss_clip": 0.01189443, + "auxiliary_loss_mlp": 0.01038289, + "balance_loss_clip": 1.0567528, + "balance_loss_mlp": 1.02955055, + "epoch": 0.43996873684843385, + "flos": 18691504104960.0, + "grad_norm": 2.2157591733945674, + "language_loss": 0.81577098, + "learning_rate": 2.4813071395423307e-06, + "loss": 0.83804834, + "num_input_tokens_seen": 78791285, + "step": 3659, + "time_per_iteration": 2.408621311187744 + }, + { + "auxiliary_loss_clip": 0.01173055, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.05435991, + "balance_loss_mlp": 1.0237174, + "epoch": 0.4400889797390729, + "flos": 23653460787840.0, + "grad_norm": 1.8011518847937302, + "language_loss": 0.64649594, + "learning_rate": 2.4805510245463263e-06, + "loss": 0.66855699, + "num_input_tokens_seen": 78811440, + "step": 3660, + "time_per_iteration": 2.487126588821411 + }, + { + "auxiliary_loss_clip": 0.01170902, + "auxiliary_loss_mlp": 0.01033402, + "balance_loss_clip": 1.05266976, + "balance_loss_mlp": 1.02436018, + "epoch": 0.440209222629712, + "flos": 23149203707520.0, + "grad_norm": 2.164366007879305, + "language_loss": 0.60653222, + "learning_rate": 2.4797948366511137e-06, + "loss": 0.62857521, + "num_input_tokens_seen": 78831150, + "step": 3661, + "time_per_iteration": 2.4744760990142822 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.01034919, + "balance_loss_clip": 1.04944253, + "balance_loss_mlp": 1.02641368, + "epoch": 0.4403294655203511, + "flos": 24823394668800.0, + "grad_norm": 2.3243586023971243, + "language_loss": 0.76402169, + "learning_rate": 2.4790385759714055e-06, + "loss": 0.7858156, + "num_input_tokens_seen": 78850215, + "step": 3662, + "time_per_iteration": 2.62807559967041 + }, + { + "auxiliary_loss_clip": 0.01171244, + "auxiliary_loss_mlp": 0.01029695, + "balance_loss_clip": 1.05690086, + "balance_loss_mlp": 1.02120781, + "epoch": 0.4404497084109902, + "flos": 22565080736640.0, + "grad_norm": 1.6277729916584753, + "language_loss": 0.71002841, + "learning_rate": 2.478282242621926e-06, + "loss": 0.73203778, + "num_input_tokens_seen": 78870675, + "step": 3663, + "time_per_iteration": 2.535227060317993 + }, + { + "auxiliary_loss_clip": 0.01051195, + "auxiliary_loss_mlp": 0.0100454, + "balance_loss_clip": 1.01939011, + "balance_loss_mlp": 1.00321126, + "epoch": 0.4405699513016293, + "flos": 64967073448320.0, + "grad_norm": 0.8409831315378775, + "language_loss": 0.59559989, + "learning_rate": 2.477525836717411e-06, + "loss": 0.61615723, + "num_input_tokens_seen": 78938440, + "step": 3664, + "time_per_iteration": 3.2285568714141846 + }, + { + "auxiliary_loss_clip": 0.01170862, + "auxiliary_loss_mlp": 0.01030515, + "balance_loss_clip": 1.05177999, + "balance_loss_mlp": 1.02216434, + "epoch": 0.4406901941922684, + "flos": 35661952978560.0, + "grad_norm": 2.331508199438346, + "language_loss": 0.79460138, + "learning_rate": 2.476769358372606e-06, + "loss": 0.81661516, + "num_input_tokens_seen": 78960090, + "step": 3665, + "time_per_iteration": 2.5901997089385986 + }, + { + "auxiliary_loss_clip": 0.01139117, + "auxiliary_loss_mlp": 0.01026376, + "balance_loss_clip": 1.05162907, + "balance_loss_mlp": 1.01894355, + "epoch": 0.44081043708290746, + "flos": 18040767361920.0, + "grad_norm": 3.17328299839823, + "language_loss": 0.75045919, + "learning_rate": 2.4760128077022683e-06, + "loss": 0.7721141, + "num_input_tokens_seen": 78978225, + "step": 3666, + "time_per_iteration": 2.565070629119873 + }, + { + "auxiliary_loss_clip": 0.01121405, + "auxiliary_loss_mlp": 0.01025168, + "balance_loss_clip": 1.05060518, + "balance_loss_mlp": 1.01710391, + "epoch": 0.44093067997354657, + "flos": 30153507799680.0, + "grad_norm": 1.682342936976867, + "language_loss": 0.6852901, + "learning_rate": 2.4752561848211672e-06, + "loss": 0.70675582, + "num_input_tokens_seen": 79000625, + "step": 3667, + "time_per_iteration": 2.6126208305358887 + }, + { + "auxiliary_loss_clip": 0.011731, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.05914748, + "balance_loss_mlp": 1.02599335, + "epoch": 0.4410509228641857, + "flos": 23255068066560.0, + "grad_norm": 1.7725359982161295, + "language_loss": 0.71227455, + "learning_rate": 2.4744994898440797e-06, + "loss": 0.73434925, + "num_input_tokens_seen": 79019415, + "step": 3668, + "time_per_iteration": 2.4678609371185303 + }, + { + "auxiliary_loss_clip": 0.0114822, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.05159271, + "balance_loss_mlp": 1.02711105, + "epoch": 0.44117116575482473, + "flos": 19500571998720.0, + "grad_norm": 1.958928770721048, + "language_loss": 0.83773434, + "learning_rate": 2.473742722885797e-06, + "loss": 0.85957688, + "num_input_tokens_seen": 79038435, + "step": 3669, + "time_per_iteration": 3.371425151824951 + }, + { + "auxiliary_loss_clip": 0.01176063, + "auxiliary_loss_mlp": 0.00763588, + "balance_loss_clip": 1.05887735, + "balance_loss_mlp": 1.00067163, + "epoch": 0.44129140864546385, + "flos": 27053124353280.0, + "grad_norm": 2.0819537096438743, + "language_loss": 0.65102518, + "learning_rate": 2.4729858840611197e-06, + "loss": 0.67042172, + "num_input_tokens_seen": 79057345, + "step": 3670, + "time_per_iteration": 2.549913167953491 + }, + { + "auxiliary_loss_clip": 0.01187426, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.05753946, + "balance_loss_mlp": 1.01855588, + "epoch": 0.4414116515361029, + "flos": 26102101910400.0, + "grad_norm": 4.809380728779214, + "language_loss": 0.72867262, + "learning_rate": 2.4722289734848605e-06, + "loss": 0.75081217, + "num_input_tokens_seen": 79077810, + "step": 3671, + "time_per_iteration": 2.4851484298706055 + }, + { + "auxiliary_loss_clip": 0.01143818, + "auxiliary_loss_mlp": 0.01028882, + "balance_loss_clip": 1.05617523, + "balance_loss_mlp": 1.02051973, + "epoch": 0.441531894426742, + "flos": 21906083865600.0, + "grad_norm": 2.0540421767820445, + "language_loss": 0.77928531, + "learning_rate": 2.471471991271841e-06, + "loss": 0.80101234, + "num_input_tokens_seen": 79094935, + "step": 3672, + "time_per_iteration": 3.3719658851623535 + }, + { + "auxiliary_loss_clip": 0.01164399, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.05276191, + "balance_loss_mlp": 1.01832795, + "epoch": 0.4416521373173811, + "flos": 23437099215360.0, + "grad_norm": 1.919010769417049, + "language_loss": 0.79262859, + "learning_rate": 2.470714937536896e-06, + "loss": 0.81454134, + "num_input_tokens_seen": 79113660, + "step": 3673, + "time_per_iteration": 2.470294237136841 + }, + { + "auxiliary_loss_clip": 0.01125172, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.0489316, + "balance_loss_mlp": 1.02226722, + "epoch": 0.4417723802080202, + "flos": 20334345471360.0, + "grad_norm": 1.661876865895167, + "language_loss": 0.70376062, + "learning_rate": 2.469957812394868e-06, + "loss": 0.72532082, + "num_input_tokens_seen": 79132470, + "step": 3674, + "time_per_iteration": 2.565774917602539 + }, + { + "auxiliary_loss_clip": 0.01186638, + "auxiliary_loss_mlp": 0.01029041, + "balance_loss_clip": 1.05809879, + "balance_loss_mlp": 1.02052379, + "epoch": 0.4418926230986593, + "flos": 18880682060160.0, + "grad_norm": 1.8807257481700441, + "language_loss": 0.76321942, + "learning_rate": 2.4692006159606148e-06, + "loss": 0.78537619, + "num_input_tokens_seen": 79150000, + "step": 3675, + "time_per_iteration": 2.4257280826568604 + }, + { + "auxiliary_loss_clip": 0.01185678, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.05552435, + "balance_loss_mlp": 1.02163005, + "epoch": 0.4420128659892984, + "flos": 19464409981440.0, + "grad_norm": 1.8759780191007915, + "language_loss": 0.78543627, + "learning_rate": 2.468443348349e-06, + "loss": 0.80759394, + "num_input_tokens_seen": 79167875, + "step": 3676, + "time_per_iteration": 3.1443326473236084 + }, + { + "auxiliary_loss_clip": 0.01127999, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.0489912, + "balance_loss_mlp": 1.02381372, + "epoch": 0.44213310887993745, + "flos": 17894359526400.0, + "grad_norm": 2.4823724357011616, + "language_loss": 0.82399458, + "learning_rate": 2.467686009674902e-06, + "loss": 0.84560943, + "num_input_tokens_seen": 79182325, + "step": 3677, + "time_per_iteration": 3.320093870162964 + }, + { + "auxiliary_loss_clip": 0.01166445, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.05101466, + "balance_loss_mlp": 1.02099323, + "epoch": 0.44225335177057656, + "flos": 19204667758080.0, + "grad_norm": 1.8654601966993722, + "language_loss": 0.85179412, + "learning_rate": 2.466928600053209e-06, + "loss": 0.87375748, + "num_input_tokens_seen": 79197630, + "step": 3678, + "time_per_iteration": 2.4332807064056396 + }, + { + "auxiliary_loss_clip": 0.0115601, + "auxiliary_loss_mlp": 0.01026681, + "balance_loss_clip": 1.05135798, + "balance_loss_mlp": 1.01846766, + "epoch": 0.4423735946612157, + "flos": 23471321898240.0, + "grad_norm": 1.8248486645074782, + "language_loss": 0.71393889, + "learning_rate": 2.466171119598818e-06, + "loss": 0.73576581, + "num_input_tokens_seen": 79217600, + "step": 3679, + "time_per_iteration": 2.5327563285827637 + }, + { + "auxiliary_loss_clip": 0.01177879, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.05263925, + "balance_loss_mlp": 1.02529597, + "epoch": 0.44249383755185473, + "flos": 26685398868480.0, + "grad_norm": 1.8872031919742798, + "language_loss": 0.77459937, + "learning_rate": 2.465413568426639e-06, + "loss": 0.79671788, + "num_input_tokens_seen": 79238550, + "step": 3680, + "time_per_iteration": 2.516883134841919 + }, + { + "auxiliary_loss_clip": 0.01167014, + "auxiliary_loss_mlp": 0.010246, + "balance_loss_clip": 1.05362177, + "balance_loss_mlp": 1.01685166, + "epoch": 0.44261408044249384, + "flos": 23147659422720.0, + "grad_norm": 2.471646533675412, + "language_loss": 0.81122637, + "learning_rate": 2.464655946651591e-06, + "loss": 0.83314252, + "num_input_tokens_seen": 79257555, + "step": 3681, + "time_per_iteration": 2.476849317550659 + }, + { + "auxiliary_loss_clip": 0.01177036, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.0574224, + "balance_loss_mlp": 1.0207355, + "epoch": 0.44273432333313295, + "flos": 24462564595200.0, + "grad_norm": 2.0268041719936543, + "language_loss": 0.81043327, + "learning_rate": 2.4638982543886065e-06, + "loss": 0.83249509, + "num_input_tokens_seen": 79277595, + "step": 3682, + "time_per_iteration": 2.4976933002471924 + }, + { + "auxiliary_loss_clip": 0.01174802, + "auxiliary_loss_mlp": 0.0103535, + "balance_loss_clip": 1.0558691, + "balance_loss_mlp": 1.02689242, + "epoch": 0.442854566223772, + "flos": 17528932512000.0, + "grad_norm": 9.69083777776178, + "language_loss": 0.87227261, + "learning_rate": 2.4631404917526254e-06, + "loss": 0.89437413, + "num_input_tokens_seen": 79294550, + "step": 3683, + "time_per_iteration": 2.4609546661376953 + }, + { + "auxiliary_loss_clip": 0.01165195, + "auxiliary_loss_mlp": 0.01027326, + "balance_loss_clip": 1.05210638, + "balance_loss_mlp": 1.01943398, + "epoch": 0.4429748091144111, + "flos": 24896293320960.0, + "grad_norm": 1.6920251277605036, + "language_loss": 0.79115558, + "learning_rate": 2.4623826588586e-06, + "loss": 0.81308079, + "num_input_tokens_seen": 79314820, + "step": 3684, + "time_per_iteration": 2.491969108581543 + }, + { + "auxiliary_loss_clip": 0.01151848, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.04962802, + "balance_loss_mlp": 1.02289438, + "epoch": 0.4430950520050502, + "flos": 21614704738560.0, + "grad_norm": 1.7836736265672408, + "language_loss": 0.82626003, + "learning_rate": 2.461624755821492e-06, + "loss": 0.84809738, + "num_input_tokens_seen": 79334300, + "step": 3685, + "time_per_iteration": 2.506840229034424 + }, + { + "auxiliary_loss_clip": 0.01142849, + "auxiliary_loss_mlp": 0.01024839, + "balance_loss_clip": 1.05109644, + "balance_loss_mlp": 1.01698887, + "epoch": 0.4432152948956893, + "flos": 24572271709440.0, + "grad_norm": 1.7522927979709753, + "language_loss": 0.76716673, + "learning_rate": 2.4608667827562763e-06, + "loss": 0.78884363, + "num_input_tokens_seen": 79353630, + "step": 3686, + "time_per_iteration": 2.5531423091888428 + }, + { + "auxiliary_loss_clip": 0.01176659, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.055722, + "balance_loss_mlp": 1.02314723, + "epoch": 0.4433355377863284, + "flos": 21762261809280.0, + "grad_norm": 2.12396996293802, + "language_loss": 0.89869851, + "learning_rate": 2.460108739777936e-06, + "loss": 0.92078197, + "num_input_tokens_seen": 79372765, + "step": 3687, + "time_per_iteration": 2.4826157093048096 + }, + { + "auxiliary_loss_clip": 0.01157357, + "auxiliary_loss_mlp": 0.01029924, + "balance_loss_clip": 1.0546813, + "balance_loss_mlp": 1.02138841, + "epoch": 0.44345578067696745, + "flos": 20084479488000.0, + "grad_norm": 1.5974800149121162, + "language_loss": 0.76480985, + "learning_rate": 2.4593506270014656e-06, + "loss": 0.78668267, + "num_input_tokens_seen": 79391735, + "step": 3688, + "time_per_iteration": 2.4870541095733643 + }, + { + "auxiliary_loss_clip": 0.01161638, + "auxiliary_loss_mlp": 0.01027759, + "balance_loss_clip": 1.05108809, + "balance_loss_mlp": 1.01942611, + "epoch": 0.44357602356760656, + "flos": 24169497528960.0, + "grad_norm": 1.6461131333415144, + "language_loss": 0.82080132, + "learning_rate": 2.45859244454187e-06, + "loss": 0.84269536, + "num_input_tokens_seen": 79411525, + "step": 3689, + "time_per_iteration": 2.550636053085327 + }, + { + "auxiliary_loss_clip": 0.01169545, + "auxiliary_loss_mlp": 0.01027946, + "balance_loss_clip": 1.05495358, + "balance_loss_mlp": 1.02015519, + "epoch": 0.44369626645824567, + "flos": 22707717644160.0, + "grad_norm": 1.741849472187036, + "language_loss": 0.66401744, + "learning_rate": 2.4578341925141655e-06, + "loss": 0.68599236, + "num_input_tokens_seen": 79430740, + "step": 3690, + "time_per_iteration": 2.4910192489624023 + }, + { + "auxiliary_loss_clip": 0.01179462, + "auxiliary_loss_mlp": 0.01025428, + "balance_loss_clip": 1.05479777, + "balance_loss_mlp": 1.01664853, + "epoch": 0.4438165093488847, + "flos": 38030225420160.0, + "grad_norm": 1.9121724741111004, + "language_loss": 0.71961272, + "learning_rate": 2.457075871033378e-06, + "loss": 0.74166155, + "num_input_tokens_seen": 79452615, + "step": 3691, + "time_per_iteration": 2.6078453063964844 + }, + { + "auxiliary_loss_clip": 0.01143557, + "auxiliary_loss_mlp": 0.01025529, + "balance_loss_clip": 1.05234361, + "balance_loss_mlp": 1.01735091, + "epoch": 0.44393675223952384, + "flos": 15523213996800.0, + "grad_norm": 1.9985631047790897, + "language_loss": 0.88896966, + "learning_rate": 2.4563174802145445e-06, + "loss": 0.91066051, + "num_input_tokens_seen": 79469865, + "step": 3692, + "time_per_iteration": 2.485480546951294 + }, + { + "auxiliary_loss_clip": 0.01063936, + "auxiliary_loss_mlp": 0.01003626, + "balance_loss_clip": 1.02172112, + "balance_loss_mlp": 1.00226104, + "epoch": 0.44405699513016295, + "flos": 64574893779840.0, + "grad_norm": 0.6334892616799727, + "language_loss": 0.48645616, + "learning_rate": 2.455559020172712e-06, + "loss": 0.50713181, + "num_input_tokens_seen": 79537220, + "step": 3693, + "time_per_iteration": 3.1618378162384033 + }, + { + "auxiliary_loss_clip": 0.01136697, + "auxiliary_loss_mlp": 0.01038172, + "balance_loss_clip": 1.05596483, + "balance_loss_mlp": 1.02953506, + "epoch": 0.444177238020802, + "flos": 23987394552960.0, + "grad_norm": 2.397392671542597, + "language_loss": 0.89748502, + "learning_rate": 2.4548004910229385e-06, + "loss": 0.91923368, + "num_input_tokens_seen": 79554795, + "step": 3694, + "time_per_iteration": 2.628120183944702 + }, + { + "auxiliary_loss_clip": 0.01176037, + "auxiliary_loss_mlp": 0.00764013, + "balance_loss_clip": 1.05644131, + "balance_loss_mlp": 1.00070786, + "epoch": 0.4442974809114411, + "flos": 22563069575040.0, + "grad_norm": 2.2172444591353107, + "language_loss": 0.87127411, + "learning_rate": 2.4540418928802913e-06, + "loss": 0.89067459, + "num_input_tokens_seen": 79573530, + "step": 3695, + "time_per_iteration": 2.4922924041748047 + }, + { + "auxiliary_loss_clip": 0.01156449, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.0512948, + "balance_loss_mlp": 1.02289367, + "epoch": 0.4444177238020802, + "flos": 17675699483520.0, + "grad_norm": 2.8643462519079854, + "language_loss": 0.65817106, + "learning_rate": 2.4532832258598506e-06, + "loss": 0.68005383, + "num_input_tokens_seen": 79591360, + "step": 3696, + "time_per_iteration": 3.226816415786743 + }, + { + "auxiliary_loss_clip": 0.01184374, + "auxiliary_loss_mlp": 0.01026062, + "balance_loss_clip": 1.05612481, + "balance_loss_mlp": 1.01759839, + "epoch": 0.4445379666927193, + "flos": 28621594609920.0, + "grad_norm": 1.7962066310276905, + "language_loss": 0.80778468, + "learning_rate": 2.4525244900767047e-06, + "loss": 0.82988906, + "num_input_tokens_seen": 79612175, + "step": 3697, + "time_per_iteration": 2.503439426422119 + }, + { + "auxiliary_loss_clip": 0.01072983, + "auxiliary_loss_mlp": 0.01001959, + "balance_loss_clip": 1.02577829, + "balance_loss_mlp": 1.00073075, + "epoch": 0.4446582095833584, + "flos": 70487370115200.0, + "grad_norm": 0.7726120850558645, + "language_loss": 0.60552257, + "learning_rate": 2.4517656856459536e-06, + "loss": 0.62627202, + "num_input_tokens_seen": 79678020, + "step": 3698, + "time_per_iteration": 3.151158332824707 + }, + { + "auxiliary_loss_clip": 0.01170439, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.05225992, + "balance_loss_mlp": 1.02581656, + "epoch": 0.4447784524739975, + "flos": 26505199313280.0, + "grad_norm": 1.656308441090315, + "language_loss": 0.68097639, + "learning_rate": 2.4510068126827073e-06, + "loss": 0.70301998, + "num_input_tokens_seen": 79699020, + "step": 3699, + "time_per_iteration": 3.3736932277679443 + }, + { + "auxiliary_loss_clip": 0.01158756, + "auxiliary_loss_mlp": 0.01035414, + "balance_loss_clip": 1.05345583, + "balance_loss_mlp": 1.02707577, + "epoch": 0.44489869536463655, + "flos": 11656209553920.0, + "grad_norm": 6.3595659630556005, + "language_loss": 0.81421053, + "learning_rate": 2.450247871302086e-06, + "loss": 0.8361522, + "num_input_tokens_seen": 79716795, + "step": 3700, + "time_per_iteration": 2.4832677841186523 + }, + { + "auxiliary_loss_clip": 0.01175798, + "auxiliary_loss_mlp": 0.01027704, + "balance_loss_clip": 1.0545212, + "balance_loss_mlp": 1.01982427, + "epoch": 0.44501893825527566, + "flos": 20448469958400.0, + "grad_norm": 2.313075096495458, + "language_loss": 0.83370793, + "learning_rate": 2.44948886161922e-06, + "loss": 0.85574299, + "num_input_tokens_seen": 79735810, + "step": 3701, + "time_per_iteration": 2.5322635173797607 + }, + { + "auxiliary_loss_clip": 0.01175748, + "auxiliary_loss_mlp": 0.01026403, + "balance_loss_clip": 1.05587935, + "balance_loss_mlp": 1.01868701, + "epoch": 0.4451391811459148, + "flos": 18261079430400.0, + "grad_norm": 1.6084210201101368, + "language_loss": 0.85110223, + "learning_rate": 2.4487297837492524e-06, + "loss": 0.87312371, + "num_input_tokens_seen": 79754975, + "step": 3702, + "time_per_iteration": 3.270439863204956 + }, + { + "auxiliary_loss_clip": 0.01142784, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.05190527, + "balance_loss_mlp": 1.02097893, + "epoch": 0.44525942403655383, + "flos": 16910155895040.0, + "grad_norm": 1.9247161420488583, + "language_loss": 0.62377113, + "learning_rate": 2.4479706378073323e-06, + "loss": 0.64549261, + "num_input_tokens_seen": 79773515, + "step": 3703, + "time_per_iteration": 2.6739790439605713 + }, + { + "auxiliary_loss_clip": 0.01131238, + "auxiliary_loss_mlp": 0.01027209, + "balance_loss_clip": 1.0463264, + "balance_loss_mlp": 1.01921606, + "epoch": 0.44537966692719294, + "flos": 23258838994560.0, + "grad_norm": 1.5727117342734176, + "language_loss": 0.83750415, + "learning_rate": 2.447211423908623e-06, + "loss": 0.8590886, + "num_input_tokens_seen": 79793560, + "step": 3704, + "time_per_iteration": 3.283668279647827 + }, + { + "auxiliary_loss_clip": 0.01173876, + "auxiliary_loss_mlp": 0.01027256, + "balance_loss_clip": 1.05376112, + "balance_loss_mlp": 1.01920366, + "epoch": 0.445499909817832, + "flos": 21724160457600.0, + "grad_norm": 2.295431497373646, + "language_loss": 0.7439239, + "learning_rate": 2.4464521421682966e-06, + "loss": 0.7659353, + "num_input_tokens_seen": 79811150, + "step": 3705, + "time_per_iteration": 2.464444875717163 + }, + { + "auxiliary_loss_clip": 0.01166224, + "auxiliary_loss_mlp": 0.01024693, + "balance_loss_clip": 1.05462337, + "balance_loss_mlp": 1.01706958, + "epoch": 0.4456201527084711, + "flos": 23987969170560.0, + "grad_norm": 1.4100867967009847, + "language_loss": 0.87436712, + "learning_rate": 2.4456927927015345e-06, + "loss": 0.89627624, + "num_input_tokens_seen": 79832190, + "step": 3706, + "time_per_iteration": 2.4772579669952393 + }, + { + "auxiliary_loss_clip": 0.0116715, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.0576086, + "balance_loss_mlp": 1.02279973, + "epoch": 0.4457403955991102, + "flos": 18807065136000.0, + "grad_norm": 2.034032312400158, + "language_loss": 0.76659322, + "learning_rate": 2.4449333756235307e-06, + "loss": 0.78858328, + "num_input_tokens_seen": 79848905, + "step": 3707, + "time_per_iteration": 2.479116439819336 + }, + { + "auxiliary_loss_clip": 0.01176042, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.05490422, + "balance_loss_mlp": 1.02784932, + "epoch": 0.4458606384897493, + "flos": 19207756327680.0, + "grad_norm": 2.225344106043708, + "language_loss": 0.78542346, + "learning_rate": 2.4441738910494876e-06, + "loss": 0.80754817, + "num_input_tokens_seen": 79863640, + "step": 3708, + "time_per_iteration": 2.4359796047210693 + }, + { + "auxiliary_loss_clip": 0.01164405, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.05187988, + "balance_loss_mlp": 1.02489424, + "epoch": 0.4459808813803884, + "flos": 21361283308800.0, + "grad_norm": 1.6699179867984635, + "language_loss": 0.82155168, + "learning_rate": 2.4434143390946176e-06, + "loss": 0.84352839, + "num_input_tokens_seen": 79882450, + "step": 3709, + "time_per_iteration": 2.4934604167938232 + }, + { + "auxiliary_loss_clip": 0.01140056, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.04868197, + "balance_loss_mlp": 1.02072358, + "epoch": 0.4461011242710275, + "flos": 23288967527040.0, + "grad_norm": 1.8439956291296107, + "language_loss": 0.85271466, + "learning_rate": 2.4426547198741457e-06, + "loss": 0.87440795, + "num_input_tokens_seen": 79900655, + "step": 3710, + "time_per_iteration": 2.5815532207489014 + }, + { + "auxiliary_loss_clip": 0.01129064, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.05202425, + "balance_loss_mlp": 1.0228343, + "epoch": 0.44622136716166655, + "flos": 20193001453440.0, + "grad_norm": 2.3639526633457377, + "language_loss": 0.74590313, + "learning_rate": 2.441895033503305e-06, + "loss": 0.76750344, + "num_input_tokens_seen": 79918575, + "step": 3711, + "time_per_iteration": 2.5526912212371826 + }, + { + "auxiliary_loss_clip": 0.0117043, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.05340171, + "balance_loss_mlp": 1.0232501, + "epoch": 0.44634161005230566, + "flos": 21283033530240.0, + "grad_norm": 1.6951554902840211, + "language_loss": 0.81890422, + "learning_rate": 2.4411352800973375e-06, + "loss": 0.84093046, + "num_input_tokens_seen": 79937010, + "step": 3712, + "time_per_iteration": 2.4535343647003174 + }, + { + "auxiliary_loss_clip": 0.01138319, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.0491575, + "balance_loss_mlp": 1.0192399, + "epoch": 0.44646185294294477, + "flos": 22929358515840.0, + "grad_norm": 2.282796522320087, + "language_loss": 0.7557168, + "learning_rate": 2.4403754597715005e-06, + "loss": 0.77737975, + "num_input_tokens_seen": 79956455, + "step": 3713, + "time_per_iteration": 2.531940460205078 + }, + { + "auxiliary_loss_clip": 0.01159664, + "auxiliary_loss_mlp": 0.01034928, + "balance_loss_clip": 1.04897785, + "balance_loss_mlp": 1.0256598, + "epoch": 0.4465820958335838, + "flos": 22637692080000.0, + "grad_norm": 2.360868212339568, + "language_loss": 0.92910033, + "learning_rate": 2.4396155726410553e-06, + "loss": 0.95104623, + "num_input_tokens_seen": 79975065, + "step": 3714, + "time_per_iteration": 2.5019724369049072 + }, + { + "auxiliary_loss_clip": 0.01177362, + "auxiliary_loss_mlp": 0.01027234, + "balance_loss_clip": 1.0527432, + "balance_loss_mlp": 1.01925325, + "epoch": 0.44670233872422294, + "flos": 22672525294080.0, + "grad_norm": 2.4848116740977697, + "language_loss": 0.91114068, + "learning_rate": 2.438855618821278e-06, + "loss": 0.93318659, + "num_input_tokens_seen": 79990865, + "step": 3715, + "time_per_iteration": 2.5113203525543213 + }, + { + "auxiliary_loss_clip": 0.01162222, + "auxiliary_loss_mlp": 0.01031138, + "balance_loss_clip": 1.04904628, + "balance_loss_mlp": 1.02264428, + "epoch": 0.44682258161486205, + "flos": 23582178247680.0, + "grad_norm": 1.6282300358166526, + "language_loss": 0.67165083, + "learning_rate": 2.4380955984274517e-06, + "loss": 0.69358444, + "num_input_tokens_seen": 80009520, + "step": 3716, + "time_per_iteration": 2.4700863361358643 + }, + { + "auxiliary_loss_clip": 0.01169011, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.05113196, + "balance_loss_mlp": 1.02758896, + "epoch": 0.4469428245055011, + "flos": 26501356558080.0, + "grad_norm": 1.658853914927923, + "language_loss": 0.76939428, + "learning_rate": 2.4373355115748716e-06, + "loss": 0.79144108, + "num_input_tokens_seen": 80030350, + "step": 3717, + "time_per_iteration": 2.5420291423797607 + }, + { + "auxiliary_loss_clip": 0.01150738, + "auxiliary_loss_mlp": 0.01031496, + "balance_loss_clip": 1.05203068, + "balance_loss_mlp": 1.02274573, + "epoch": 0.4470630673961402, + "flos": 21504925797120.0, + "grad_norm": 1.686356699989055, + "language_loss": 0.72169089, + "learning_rate": 2.436575358378842e-06, + "loss": 0.74351323, + "num_input_tokens_seen": 80049840, + "step": 3718, + "time_per_iteration": 2.50437331199646 + }, + { + "auxiliary_loss_clip": 0.01167251, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.05460501, + "balance_loss_mlp": 1.02271855, + "epoch": 0.44718331028677927, + "flos": 16173986653440.0, + "grad_norm": 3.6961995642420487, + "language_loss": 0.83117914, + "learning_rate": 2.4358151389546782e-06, + "loss": 0.85317075, + "num_input_tokens_seen": 80066525, + "step": 3719, + "time_per_iteration": 2.4982588291168213 + }, + { + "auxiliary_loss_clip": 0.01185726, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.05546641, + "balance_loss_mlp": 1.02307987, + "epoch": 0.4473035531774184, + "flos": 19681238430720.0, + "grad_norm": 2.215861306443257, + "language_loss": 0.76690769, + "learning_rate": 2.4350548534177035e-06, + "loss": 0.78908229, + "num_input_tokens_seen": 80083355, + "step": 3720, + "time_per_iteration": 2.399819850921631 + }, + { + "auxiliary_loss_clip": 0.01142642, + "auxiliary_loss_mlp": 0.01032894, + "balance_loss_clip": 1.0513339, + "balance_loss_mlp": 1.02521062, + "epoch": 0.4474237960680575, + "flos": 41427590515200.0, + "grad_norm": 1.5486249133206893, + "language_loss": 0.6682502, + "learning_rate": 2.434294501883254e-06, + "loss": 0.69000554, + "num_input_tokens_seen": 80106450, + "step": 3721, + "time_per_iteration": 2.7164411544799805 + }, + { + "auxiliary_loss_clip": 0.01146654, + "auxiliary_loss_mlp": 0.0103011, + "balance_loss_clip": 1.04784989, + "balance_loss_mlp": 1.02148521, + "epoch": 0.44754403895869654, + "flos": 22891328991360.0, + "grad_norm": 1.6817321486877723, + "language_loss": 0.65677023, + "learning_rate": 2.433534084466674e-06, + "loss": 0.67853791, + "num_input_tokens_seen": 80125670, + "step": 3722, + "time_per_iteration": 3.3199877738952637 + }, + { + "auxiliary_loss_clip": 0.01181532, + "auxiliary_loss_mlp": 0.01027752, + "balance_loss_clip": 1.05417609, + "balance_loss_mlp": 1.01946068, + "epoch": 0.44766428184933565, + "flos": 25630271832960.0, + "grad_norm": 1.6413005810510426, + "language_loss": 0.7104544, + "learning_rate": 2.4327736012833178e-06, + "loss": 0.73254716, + "num_input_tokens_seen": 80147390, + "step": 3723, + "time_per_iteration": 2.4865431785583496 + }, + { + "auxiliary_loss_clip": 0.01171549, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.05430961, + "balance_loss_mlp": 1.02479935, + "epoch": 0.44778452473997477, + "flos": 20448972748800.0, + "grad_norm": 2.0468483539001556, + "language_loss": 0.76963258, + "learning_rate": 2.4320130524485506e-06, + "loss": 0.7916801, + "num_input_tokens_seen": 80166185, + "step": 3724, + "time_per_iteration": 2.446181058883667 + }, + { + "auxiliary_loss_clip": 0.01151242, + "auxiliary_loss_mlp": 0.010251, + "balance_loss_clip": 1.05543447, + "balance_loss_mlp": 1.01764607, + "epoch": 0.4479047676306138, + "flos": 21975462984960.0, + "grad_norm": 4.294670207796363, + "language_loss": 0.79626513, + "learning_rate": 2.431252438077746e-06, + "loss": 0.81802857, + "num_input_tokens_seen": 80185685, + "step": 3725, + "time_per_iteration": 3.3264570236206055 + }, + { + "auxiliary_loss_clip": 0.01174841, + "auxiliary_loss_mlp": 0.00763459, + "balance_loss_clip": 1.05286789, + "balance_loss_mlp": 1.00054598, + "epoch": 0.44802501052125293, + "flos": 21467219495040.0, + "grad_norm": 2.3404871547673305, + "language_loss": 0.7730754, + "learning_rate": 2.4304917582862906e-06, + "loss": 0.79245836, + "num_input_tokens_seen": 80204865, + "step": 3726, + "time_per_iteration": 2.4551732540130615 + }, + { + "auxiliary_loss_clip": 0.01183236, + "auxiliary_loss_mlp": 0.01028018, + "balance_loss_clip": 1.05398405, + "balance_loss_mlp": 1.01988196, + "epoch": 0.44814525341189204, + "flos": 22126970551680.0, + "grad_norm": 1.904719991320197, + "language_loss": 0.8765772, + "learning_rate": 2.4297310131895774e-06, + "loss": 0.89868975, + "num_input_tokens_seen": 80223410, + "step": 3727, + "time_per_iteration": 2.4239656925201416 + }, + { + "auxiliary_loss_clip": 0.01169893, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.05285311, + "balance_loss_mlp": 1.02307653, + "epoch": 0.4482654963025311, + "flos": 16653933204480.0, + "grad_norm": 1.94850857925264, + "language_loss": 0.74924505, + "learning_rate": 2.4289702029030113e-06, + "loss": 0.77126175, + "num_input_tokens_seen": 80240880, + "step": 3728, + "time_per_iteration": 3.241889715194702 + }, + { + "auxiliary_loss_clip": 0.01171947, + "auxiliary_loss_mlp": 0.01027051, + "balance_loss_clip": 1.05626512, + "balance_loss_mlp": 1.01886129, + "epoch": 0.4483857391931702, + "flos": 18841251905280.0, + "grad_norm": 1.7704911897689553, + "language_loss": 0.83301461, + "learning_rate": 2.4282093275420057e-06, + "loss": 0.85500461, + "num_input_tokens_seen": 80259910, + "step": 3729, + "time_per_iteration": 2.490283489227295 + }, + { + "auxiliary_loss_clip": 0.01175912, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.05572116, + "balance_loss_mlp": 1.02324331, + "epoch": 0.4485059820838093, + "flos": 20372590477440.0, + "grad_norm": 2.1172448537654196, + "language_loss": 0.70855886, + "learning_rate": 2.4274483872219863e-06, + "loss": 0.73062837, + "num_input_tokens_seen": 80277270, + "step": 3730, + "time_per_iteration": 2.428694009780884 + }, + { + "auxiliary_loss_clip": 0.01166611, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.05191278, + "balance_loss_mlp": 1.02068686, + "epoch": 0.4486262249744484, + "flos": 20047742853120.0, + "grad_norm": 1.7888091077398294, + "language_loss": 0.93992376, + "learning_rate": 2.426687382058386e-06, + "loss": 0.96187705, + "num_input_tokens_seen": 80295550, + "step": 3731, + "time_per_iteration": 3.2148828506469727 + }, + { + "auxiliary_loss_clip": 0.0107036, + "auxiliary_loss_mlp": 0.01004385, + "balance_loss_clip": 1.02395248, + "balance_loss_mlp": 1.00322914, + "epoch": 0.4487464678650875, + "flos": 64595684776320.0, + "grad_norm": 0.8617764825105491, + "language_loss": 0.59841961, + "learning_rate": 2.425926312166649e-06, + "loss": 0.61916709, + "num_input_tokens_seen": 80348425, + "step": 3732, + "time_per_iteration": 2.925603151321411 + }, + { + "auxiliary_loss_clip": 0.01161711, + "auxiliary_loss_mlp": 0.01025832, + "balance_loss_clip": 1.05454564, + "balance_loss_mlp": 1.01705253, + "epoch": 0.4488667107557266, + "flos": 20769798049920.0, + "grad_norm": 2.0758542749481492, + "language_loss": 0.72705811, + "learning_rate": 2.42516517766223e-06, + "loss": 0.74893355, + "num_input_tokens_seen": 80366505, + "step": 3733, + "time_per_iteration": 2.46803617477417 + }, + { + "auxiliary_loss_clip": 0.01184597, + "auxiliary_loss_mlp": 0.01027602, + "balance_loss_clip": 1.05712795, + "balance_loss_mlp": 1.01927507, + "epoch": 0.44898695364636565, + "flos": 23951735326080.0, + "grad_norm": 2.402057180054235, + "language_loss": 0.68073934, + "learning_rate": 2.4244039786605907e-06, + "loss": 0.70286131, + "num_input_tokens_seen": 80387510, + "step": 3734, + "time_per_iteration": 2.461834669113159 + }, + { + "auxiliary_loss_clip": 0.01126024, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.04622388, + "balance_loss_mlp": 1.01857138, + "epoch": 0.44910719653700476, + "flos": 18624351628800.0, + "grad_norm": 2.326495437943824, + "language_loss": 0.82343304, + "learning_rate": 2.4236427152772055e-06, + "loss": 0.84496337, + "num_input_tokens_seen": 80405915, + "step": 3735, + "time_per_iteration": 2.5284368991851807 + }, + { + "auxiliary_loss_clip": 0.01036357, + "auxiliary_loss_mlp": 0.01002037, + "balance_loss_clip": 1.01779127, + "balance_loss_mlp": 1.00080955, + "epoch": 0.4492274394276438, + "flos": 57033435749760.0, + "grad_norm": 0.828588795770772, + "language_loss": 0.57338703, + "learning_rate": 2.422881387627557e-06, + "loss": 0.59377098, + "num_input_tokens_seen": 80458365, + "step": 3736, + "time_per_iteration": 2.827345371246338 + }, + { + "auxiliary_loss_clip": 0.01159183, + "auxiliary_loss_mlp": 0.01024103, + "balance_loss_clip": 1.05367064, + "balance_loss_mlp": 1.01621163, + "epoch": 0.4493476823182829, + "flos": 23254888498560.0, + "grad_norm": 1.5475513603628974, + "language_loss": 0.7744534, + "learning_rate": 2.422119995827139e-06, + "loss": 0.79628628, + "num_input_tokens_seen": 80478490, + "step": 3737, + "time_per_iteration": 2.501750946044922 + }, + { + "auxiliary_loss_clip": 0.01173609, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.05444133, + "balance_loss_mlp": 1.02088773, + "epoch": 0.44946792520892204, + "flos": 15815131827840.0, + "grad_norm": 2.839449450650751, + "language_loss": 0.74221724, + "learning_rate": 2.4213585399914528e-06, + "loss": 0.76424158, + "num_input_tokens_seen": 80495695, + "step": 3738, + "time_per_iteration": 2.4529948234558105 + }, + { + "auxiliary_loss_clip": 0.01171071, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.05458117, + "balance_loss_mlp": 1.02071047, + "epoch": 0.4495881680995611, + "flos": 19610063631360.0, + "grad_norm": 1.738964745627871, + "language_loss": 0.85120904, + "learning_rate": 2.4205970202360113e-06, + "loss": 0.87320769, + "num_input_tokens_seen": 80515260, + "step": 3739, + "time_per_iteration": 2.488502025604248 + }, + { + "auxiliary_loss_clip": 0.01118295, + "auxiliary_loss_mlp": 0.01027346, + "balance_loss_clip": 1.04790878, + "balance_loss_mlp": 1.01862633, + "epoch": 0.4497084109902002, + "flos": 26031465815040.0, + "grad_norm": 1.8509698700314674, + "language_loss": 0.77969623, + "learning_rate": 2.4198354366763354e-06, + "loss": 0.80115259, + "num_input_tokens_seen": 80533900, + "step": 3740, + "time_per_iteration": 2.5950775146484375 + }, + { + "auxiliary_loss_clip": 0.01160249, + "auxiliary_loss_mlp": 0.01026637, + "balance_loss_clip": 1.05328619, + "balance_loss_mlp": 1.01825058, + "epoch": 0.4498286538808393, + "flos": 14793688771200.0, + "grad_norm": 2.2453774512902474, + "language_loss": 0.78749019, + "learning_rate": 2.4190737894279587e-06, + "loss": 0.80935907, + "num_input_tokens_seen": 80551270, + "step": 3741, + "time_per_iteration": 2.5710997581481934 + }, + { + "auxiliary_loss_clip": 0.01130927, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.04537284, + "balance_loss_mlp": 1.01971555, + "epoch": 0.44994889677147837, + "flos": 15450171690240.0, + "grad_norm": 2.0451407789837037, + "language_loss": 0.80230498, + "learning_rate": 2.4183120786064203e-06, + "loss": 0.82388908, + "num_input_tokens_seen": 80568145, + "step": 3742, + "time_per_iteration": 2.496428966522217 + }, + { + "auxiliary_loss_clip": 0.01170937, + "auxiliary_loss_mlp": 0.00762754, + "balance_loss_clip": 1.05686092, + "balance_loss_mlp": 1.00051403, + "epoch": 0.4500691396621175, + "flos": 21798316085760.0, + "grad_norm": 2.2590118283325, + "language_loss": 0.85631835, + "learning_rate": 2.417550304327273e-06, + "loss": 0.87565523, + "num_input_tokens_seen": 80586185, + "step": 3743, + "time_per_iteration": 2.535747766494751 + }, + { + "auxiliary_loss_clip": 0.01185947, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.05553615, + "balance_loss_mlp": 1.02451897, + "epoch": 0.4501893825527566, + "flos": 32382016421760.0, + "grad_norm": 1.6238272100642819, + "language_loss": 0.75857115, + "learning_rate": 2.4167884667060763e-06, + "loss": 0.780761, + "num_input_tokens_seen": 80608895, + "step": 3744, + "time_per_iteration": 2.560973882675171 + }, + { + "auxiliary_loss_clip": 0.011562, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.05149508, + "balance_loss_mlp": 1.02273417, + "epoch": 0.45030962544339564, + "flos": 16544944362240.0, + "grad_norm": 2.0841093455484585, + "language_loss": 0.87233818, + "learning_rate": 2.4160265658584e-06, + "loss": 0.89421046, + "num_input_tokens_seen": 80623785, + "step": 3745, + "time_per_iteration": 2.479842185974121 + }, + { + "auxiliary_loss_clip": 0.01175097, + "auxiliary_loss_mlp": 0.01026984, + "balance_loss_clip": 1.05516124, + "balance_loss_mlp": 1.01866317, + "epoch": 0.45042986833403476, + "flos": 19573039687680.0, + "grad_norm": 1.9375556610580837, + "language_loss": 0.68178022, + "learning_rate": 2.4152646018998253e-06, + "loss": 0.7038011, + "num_input_tokens_seen": 80642735, + "step": 3746, + "time_per_iteration": 2.4767673015594482 + }, + { + "auxiliary_loss_clip": 0.01167417, + "auxiliary_loss_mlp": 0.01036901, + "balance_loss_clip": 1.05326366, + "balance_loss_mlp": 1.02888441, + "epoch": 0.45055011122467387, + "flos": 23112467072640.0, + "grad_norm": 2.4059667216935856, + "language_loss": 0.71638024, + "learning_rate": 2.4145025749459403e-06, + "loss": 0.73842341, + "num_input_tokens_seen": 80663760, + "step": 3747, + "time_per_iteration": 2.540201187133789 + }, + { + "auxiliary_loss_clip": 0.01100442, + "auxiliary_loss_mlp": 0.01037028, + "balance_loss_clip": 1.04926336, + "balance_loss_mlp": 1.02842689, + "epoch": 0.4506703541153129, + "flos": 19934623946880.0, + "grad_norm": 3.1361848550496743, + "language_loss": 0.70132393, + "learning_rate": 2.413740485112344e-06, + "loss": 0.72269857, + "num_input_tokens_seen": 80682100, + "step": 3748, + "time_per_iteration": 2.6768319606781006 + }, + { + "auxiliary_loss_clip": 0.01149807, + "auxiliary_loss_mlp": 0.0102574, + "balance_loss_clip": 1.05397427, + "balance_loss_mlp": 1.01729381, + "epoch": 0.45079059700595203, + "flos": 19499530504320.0, + "grad_norm": 1.6708286229857756, + "language_loss": 0.82146597, + "learning_rate": 2.412978332514646e-06, + "loss": 0.84322149, + "num_input_tokens_seen": 80700880, + "step": 3749, + "time_per_iteration": 3.3277199268341064 + }, + { + "auxiliary_loss_clip": 0.01160279, + "auxiliary_loss_mlp": 0.01025385, + "balance_loss_clip": 1.05437994, + "balance_loss_mlp": 1.01671219, + "epoch": 0.4509108398965911, + "flos": 27636313570560.0, + "grad_norm": 2.5039223583520607, + "language_loss": 0.71925116, + "learning_rate": 2.4122161172684623e-06, + "loss": 0.74110776, + "num_input_tokens_seen": 80721675, + "step": 3750, + "time_per_iteration": 2.653538942337036 + }, + { + "auxiliary_loss_clip": 0.01159929, + "auxiliary_loss_mlp": 0.0103592, + "balance_loss_clip": 1.05353808, + "balance_loss_mlp": 1.02708101, + "epoch": 0.4510310827872302, + "flos": 20995712640000.0, + "grad_norm": 4.125961595740625, + "language_loss": 0.83858192, + "learning_rate": 2.4114538394894216e-06, + "loss": 0.86054045, + "num_input_tokens_seen": 80739315, + "step": 3751, + "time_per_iteration": 2.5357472896575928 + }, + { + "auxiliary_loss_clip": 0.01152362, + "auxiliary_loss_mlp": 0.01025031, + "balance_loss_clip": 1.04778242, + "balance_loss_mlp": 1.01704431, + "epoch": 0.4511513256778693, + "flos": 16216684945920.0, + "grad_norm": 1.9909336664787296, + "language_loss": 0.83124518, + "learning_rate": 2.410691499293161e-06, + "loss": 0.85301912, + "num_input_tokens_seen": 80757470, + "step": 3752, + "time_per_iteration": 3.3600518703460693 + }, + { + "auxiliary_loss_clip": 0.0116877, + "auxiliary_loss_mlp": 0.01026002, + "balance_loss_clip": 1.05273366, + "balance_loss_mlp": 1.01769948, + "epoch": 0.45127156856850836, + "flos": 25186702780800.0, + "grad_norm": 1.586291049664566, + "language_loss": 0.74620748, + "learning_rate": 2.409929096795326e-06, + "loss": 0.76815522, + "num_input_tokens_seen": 80777840, + "step": 3753, + "time_per_iteration": 2.5574533939361572 + }, + { + "auxiliary_loss_clip": 0.01170796, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.05291188, + "balance_loss_mlp": 1.02073836, + "epoch": 0.4513918114591475, + "flos": 20412523422720.0, + "grad_norm": 1.8514644660422697, + "language_loss": 0.79435682, + "learning_rate": 2.409166632111573e-06, + "loss": 0.81636047, + "num_input_tokens_seen": 80795975, + "step": 3754, + "time_per_iteration": 2.5205471515655518 + }, + { + "auxiliary_loss_clip": 0.01178116, + "auxiliary_loss_mlp": 0.01026187, + "balance_loss_clip": 1.05422628, + "balance_loss_mlp": 1.01752615, + "epoch": 0.4515120543497866, + "flos": 26648482665600.0, + "grad_norm": 1.8717835396622062, + "language_loss": 0.80601835, + "learning_rate": 2.4084041053575674e-06, + "loss": 0.82806146, + "num_input_tokens_seen": 80815395, + "step": 3755, + "time_per_iteration": 3.294748544692993 + }, + { + "auxiliary_loss_clip": 0.0116126, + "auxiliary_loss_mlp": 0.01025496, + "balance_loss_clip": 1.05414999, + "balance_loss_mlp": 1.01704431, + "epoch": 0.45163229724042564, + "flos": 20595093275520.0, + "grad_norm": 2.241044103456221, + "language_loss": 0.7234416, + "learning_rate": 2.4076415166489834e-06, + "loss": 0.74530923, + "num_input_tokens_seen": 80834805, + "step": 3756, + "time_per_iteration": 2.4837145805358887 + }, + { + "auxiliary_loss_clip": 0.01133074, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.05036211, + "balance_loss_mlp": 1.02398205, + "epoch": 0.45175254013106475, + "flos": 21689004021120.0, + "grad_norm": 2.168584034022373, + "language_loss": 0.78813004, + "learning_rate": 2.406878866101506e-06, + "loss": 0.80977929, + "num_input_tokens_seen": 80853770, + "step": 3757, + "time_per_iteration": 2.6080996990203857 + }, + { + "auxiliary_loss_clip": 0.0118539, + "auxiliary_loss_mlp": 0.01028538, + "balance_loss_clip": 1.05763328, + "balance_loss_mlp": 1.02081299, + "epoch": 0.45187278302170386, + "flos": 18878850466560.0, + "grad_norm": 1.9961197332170317, + "language_loss": 0.78091234, + "learning_rate": 2.4061161538308273e-06, + "loss": 0.80305159, + "num_input_tokens_seen": 80870615, + "step": 3758, + "time_per_iteration": 3.177151679992676 + }, + { + "auxiliary_loss_clip": 0.0116982, + "auxiliary_loss_mlp": 0.01027524, + "balance_loss_clip": 1.05452859, + "balance_loss_mlp": 1.01926851, + "epoch": 0.4519930259123429, + "flos": 18582479349120.0, + "grad_norm": 1.8890221530485913, + "language_loss": 0.88957685, + "learning_rate": 2.4053533799526523e-06, + "loss": 0.91155028, + "num_input_tokens_seen": 80886335, + "step": 3759, + "time_per_iteration": 2.453939199447632 + }, + { + "auxiliary_loss_clip": 0.01150136, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.05201554, + "balance_loss_mlp": 1.02288747, + "epoch": 0.452113268802982, + "flos": 25192377129600.0, + "grad_norm": 1.6118747846566377, + "language_loss": 0.8601687, + "learning_rate": 2.404590544582691e-06, + "loss": 0.88198078, + "num_input_tokens_seen": 80904570, + "step": 3760, + "time_per_iteration": 2.5175836086273193 + }, + { + "auxiliary_loss_clip": 0.01129549, + "auxiliary_loss_mlp": 0.0103248, + "balance_loss_clip": 1.04423451, + "balance_loss_mlp": 1.02413559, + "epoch": 0.45223351169362114, + "flos": 39378922312320.0, + "grad_norm": 1.7108712405252762, + "language_loss": 0.80917883, + "learning_rate": 2.403827647836666e-06, + "loss": 0.8307991, + "num_input_tokens_seen": 80925125, + "step": 3761, + "time_per_iteration": 2.7732365131378174 + }, + { + "auxiliary_loss_clip": 0.01185998, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.05456853, + "balance_loss_mlp": 1.02126551, + "epoch": 0.4523537545842602, + "flos": 21582169994880.0, + "grad_norm": 1.8969500765232759, + "language_loss": 0.69223797, + "learning_rate": 2.4030646898303075e-06, + "loss": 0.71439648, + "num_input_tokens_seen": 80946615, + "step": 3762, + "time_per_iteration": 2.5373876094818115 + }, + { + "auxiliary_loss_clip": 0.01161509, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.05302978, + "balance_loss_mlp": 1.02747226, + "epoch": 0.4524739974748993, + "flos": 28439527547520.0, + "grad_norm": 2.2145640261037527, + "language_loss": 0.81676447, + "learning_rate": 2.4023016706793566e-06, + "loss": 0.83873606, + "num_input_tokens_seen": 80966410, + "step": 3763, + "time_per_iteration": 2.604886531829834 + }, + { + "auxiliary_loss_clip": 0.01056349, + "auxiliary_loss_mlp": 0.01003428, + "balance_loss_clip": 1.01989651, + "balance_loss_mlp": 1.00230718, + "epoch": 0.4525942403655384, + "flos": 61556492148480.0, + "grad_norm": 0.7605593372196167, + "language_loss": 0.56882799, + "learning_rate": 2.401538590499561e-06, + "loss": 0.58942574, + "num_input_tokens_seen": 81026865, + "step": 3764, + "time_per_iteration": 3.249357223510742 + }, + { + "auxiliary_loss_clip": 0.01173899, + "auxiliary_loss_mlp": 0.00763177, + "balance_loss_clip": 1.05513811, + "balance_loss_mlp": 1.00044131, + "epoch": 0.45271448325617747, + "flos": 27529838680320.0, + "grad_norm": 2.130759408348567, + "language_loss": 0.71776503, + "learning_rate": 2.400775449406682e-06, + "loss": 0.73713577, + "num_input_tokens_seen": 81050060, + "step": 3765, + "time_per_iteration": 2.6272900104522705 + }, + { + "auxiliary_loss_clip": 0.01169285, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.0516479, + "balance_loss_mlp": 1.02304423, + "epoch": 0.4528347261468166, + "flos": 22452608275200.0, + "grad_norm": 1.7924407815249894, + "language_loss": 0.72836196, + "learning_rate": 2.400012247516485e-06, + "loss": 0.75036359, + "num_input_tokens_seen": 81070625, + "step": 3766, + "time_per_iteration": 2.55890154838562 + }, + { + "auxiliary_loss_clip": 0.01144676, + "auxiliary_loss_mlp": 0.01028603, + "balance_loss_clip": 1.04866433, + "balance_loss_mlp": 1.02052093, + "epoch": 0.45295496903745563, + "flos": 21103875469440.0, + "grad_norm": 1.8522468763111193, + "language_loss": 0.90215921, + "learning_rate": 2.3992489849447484e-06, + "loss": 0.92389202, + "num_input_tokens_seen": 81089080, + "step": 3767, + "time_per_iteration": 2.5342369079589844 + }, + { + "auxiliary_loss_clip": 0.01146481, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.04963183, + "balance_loss_mlp": 1.01989055, + "epoch": 0.45307521192809475, + "flos": 23221168606080.0, + "grad_norm": 1.593612532900772, + "language_loss": 0.78877389, + "learning_rate": 2.3984856618072584e-06, + "loss": 0.8105194, + "num_input_tokens_seen": 81109115, + "step": 3768, + "time_per_iteration": 2.526691198348999 + }, + { + "auxiliary_loss_clip": 0.01147603, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.05243802, + "balance_loss_mlp": 1.02458751, + "epoch": 0.45319545481873386, + "flos": 15560094286080.0, + "grad_norm": 1.9856900933397736, + "language_loss": 0.73791873, + "learning_rate": 2.3977222782198098e-06, + "loss": 0.75972319, + "num_input_tokens_seen": 81127750, + "step": 3769, + "time_per_iteration": 2.4951794147491455 + }, + { + "auxiliary_loss_clip": 0.01133546, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.04953289, + "balance_loss_mlp": 1.02744913, + "epoch": 0.4533156977093729, + "flos": 21944759834880.0, + "grad_norm": 1.5617537540052147, + "language_loss": 0.75110316, + "learning_rate": 2.3969588342982077e-06, + "loss": 0.77281058, + "num_input_tokens_seen": 81147125, + "step": 3770, + "time_per_iteration": 2.5324506759643555 + }, + { + "auxiliary_loss_clip": 0.01168656, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.05515385, + "balance_loss_mlp": 1.02136564, + "epoch": 0.453435940600012, + "flos": 24242180699520.0, + "grad_norm": 1.5061330962328552, + "language_loss": 0.72658408, + "learning_rate": 2.396195330158267e-06, + "loss": 0.74856937, + "num_input_tokens_seen": 81167015, + "step": 3771, + "time_per_iteration": 2.49704909324646 + }, + { + "auxiliary_loss_clip": 0.01184709, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.05495811, + "balance_loss_mlp": 1.01910233, + "epoch": 0.45355618349065113, + "flos": 23440367352960.0, + "grad_norm": 1.6790523680169442, + "language_loss": 0.79651469, + "learning_rate": 2.3954317659158094e-06, + "loss": 0.81863713, + "num_input_tokens_seen": 81187350, + "step": 3772, + "time_per_iteration": 2.465524435043335 + }, + { + "auxiliary_loss_clip": 0.01080716, + "auxiliary_loss_mlp": 0.01000878, + "balance_loss_clip": 1.01787233, + "balance_loss_mlp": 0.99976921, + "epoch": 0.4536764263812902, + "flos": 66903161448960.0, + "grad_norm": 0.8945363325813748, + "language_loss": 0.56934851, + "learning_rate": 2.394668141686667e-06, + "loss": 0.59016442, + "num_input_tokens_seen": 81249315, + "step": 3773, + "time_per_iteration": 3.051337718963623 + }, + { + "auxiliary_loss_clip": 0.0116444, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.0500102, + "balance_loss_mlp": 1.0203923, + "epoch": 0.4537966692719293, + "flos": 42739766254080.0, + "grad_norm": 2.0690370994660694, + "language_loss": 0.69371283, + "learning_rate": 2.3939044575866813e-06, + "loss": 0.71563989, + "num_input_tokens_seen": 81272065, + "step": 3774, + "time_per_iteration": 2.6522011756896973 + }, + { + "auxiliary_loss_clip": 0.01150021, + "auxiliary_loss_mlp": 0.0076308, + "balance_loss_clip": 1.04903984, + "balance_loss_mlp": 1.0004549, + "epoch": 0.4539169121625684, + "flos": 35549480517120.0, + "grad_norm": 2.1064599192742692, + "language_loss": 0.75257522, + "learning_rate": 2.3931407137317024e-06, + "loss": 0.77170622, + "num_input_tokens_seen": 81292220, + "step": 3775, + "time_per_iteration": 2.6045687198638916 + }, + { + "auxiliary_loss_clip": 0.01138393, + "auxiliary_loss_mlp": 0.01032629, + "balance_loss_clip": 1.04696679, + "balance_loss_mlp": 1.0239501, + "epoch": 0.45403715505320746, + "flos": 18514716341760.0, + "grad_norm": 1.6599063037356312, + "language_loss": 0.85206449, + "learning_rate": 2.3923769102375907e-06, + "loss": 0.87377477, + "num_input_tokens_seen": 81311085, + "step": 3776, + "time_per_iteration": 3.357940912246704 + }, + { + "auxiliary_loss_clip": 0.01141734, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.04974818, + "balance_loss_mlp": 1.0260148, + "epoch": 0.4541573979438466, + "flos": 25045825639680.0, + "grad_norm": 2.5178914633459057, + "language_loss": 0.78348935, + "learning_rate": 2.391613047220213e-06, + "loss": 0.80525553, + "num_input_tokens_seen": 81330985, + "step": 3777, + "time_per_iteration": 2.5558977127075195 + }, + { + "auxiliary_loss_clip": 0.01133966, + "auxiliary_loss_mlp": 0.01026218, + "balance_loss_clip": 1.04943693, + "balance_loss_mlp": 1.01796913, + "epoch": 0.4542776408344857, + "flos": 18332397884160.0, + "grad_norm": 2.0353926887114646, + "language_loss": 0.79293275, + "learning_rate": 2.390849124795447e-06, + "loss": 0.81453454, + "num_input_tokens_seen": 81346985, + "step": 3778, + "time_per_iteration": 2.5255467891693115 + }, + { + "auxiliary_loss_clip": 0.01185832, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.05525661, + "balance_loss_mlp": 1.01935673, + "epoch": 0.45439788372512474, + "flos": 20701173116160.0, + "grad_norm": 2.0985837238693246, + "language_loss": 0.84202588, + "learning_rate": 2.3900851430791804e-06, + "loss": 0.86415815, + "num_input_tokens_seen": 81365005, + "step": 3779, + "time_per_iteration": 3.2632970809936523 + }, + { + "auxiliary_loss_clip": 0.01187104, + "auxiliary_loss_mlp": 0.01032497, + "balance_loss_clip": 1.05380249, + "balance_loss_mlp": 1.0232048, + "epoch": 0.45451812661576385, + "flos": 22309432663680.0, + "grad_norm": 2.3253835641828347, + "language_loss": 0.84480727, + "learning_rate": 2.389321102187307e-06, + "loss": 0.86700326, + "num_input_tokens_seen": 81383785, + "step": 3780, + "time_per_iteration": 2.4269118309020996 + }, + { + "auxiliary_loss_clip": 0.0115723, + "auxiliary_loss_mlp": 0.00763807, + "balance_loss_clip": 1.05281854, + "balance_loss_mlp": 1.00042903, + "epoch": 0.4546383695064029, + "flos": 21763303303680.0, + "grad_norm": 1.8859639538980257, + "language_loss": 0.8174938, + "learning_rate": 2.3885570022357326e-06, + "loss": 0.83670413, + "num_input_tokens_seen": 81402915, + "step": 3781, + "time_per_iteration": 3.2148799896240234 + }, + { + "auxiliary_loss_clip": 0.01051458, + "auxiliary_loss_mlp": 0.01005437, + "balance_loss_clip": 1.01503432, + "balance_loss_mlp": 1.00407243, + "epoch": 0.454758612397042, + "flos": 64242755694720.0, + "grad_norm": 0.8152794847847367, + "language_loss": 0.60924393, + "learning_rate": 2.38779284334037e-06, + "loss": 0.6298129, + "num_input_tokens_seen": 81467890, + "step": 3782, + "time_per_iteration": 3.1294891834259033 + }, + { + "auxiliary_loss_clip": 0.01115241, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.04538035, + "balance_loss_mlp": 1.02230966, + "epoch": 0.4548788552876811, + "flos": 27304175485440.0, + "grad_norm": 1.92047963167719, + "language_loss": 0.78968054, + "learning_rate": 2.387028625617141e-06, + "loss": 0.81114072, + "num_input_tokens_seen": 81487105, + "step": 3783, + "time_per_iteration": 2.592634677886963 + }, + { + "auxiliary_loss_clip": 0.01142262, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.04887736, + "balance_loss_mlp": 1.02086866, + "epoch": 0.4549990981783202, + "flos": 22857142222080.0, + "grad_norm": 1.8124798147360146, + "language_loss": 0.84771532, + "learning_rate": 2.3862643491819766e-06, + "loss": 0.86942792, + "num_input_tokens_seen": 81505670, + "step": 3784, + "time_per_iteration": 3.275575876235962 + }, + { + "auxiliary_loss_clip": 0.01165969, + "auxiliary_loss_mlp": 0.01028451, + "balance_loss_clip": 1.04948103, + "balance_loss_mlp": 1.02048779, + "epoch": 0.4551193410689593, + "flos": 23258587599360.0, + "grad_norm": 1.7297506885774971, + "language_loss": 0.84381652, + "learning_rate": 2.3855000141508186e-06, + "loss": 0.86576068, + "num_input_tokens_seen": 81525825, + "step": 3785, + "time_per_iteration": 2.4763951301574707 + }, + { + "auxiliary_loss_clip": 0.01161859, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.05582595, + "balance_loss_mlp": 1.02308285, + "epoch": 0.4552395839595984, + "flos": 20777519473920.0, + "grad_norm": 2.094454743344971, + "language_loss": 0.83995044, + "learning_rate": 2.3847356206396143e-06, + "loss": 0.86188877, + "num_input_tokens_seen": 81543135, + "step": 3786, + "time_per_iteration": 2.498401165008545 + }, + { + "auxiliary_loss_clip": 0.0118489, + "auxiliary_loss_mlp": 0.01027389, + "balance_loss_clip": 1.0552392, + "balance_loss_mlp": 1.01892543, + "epoch": 0.45535982685023746, + "flos": 23257510191360.0, + "grad_norm": 1.5030745684404934, + "language_loss": 0.78684652, + "learning_rate": 2.3839711687643227e-06, + "loss": 0.80896932, + "num_input_tokens_seen": 81564360, + "step": 3787, + "time_per_iteration": 2.4950311183929443 + }, + { + "auxiliary_loss_clip": 0.01171028, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.05355573, + "balance_loss_mlp": 1.02124071, + "epoch": 0.45548006974087657, + "flos": 19646117907840.0, + "grad_norm": 1.9704479939940027, + "language_loss": 0.73939848, + "learning_rate": 2.38320665864091e-06, + "loss": 0.76141357, + "num_input_tokens_seen": 81583710, + "step": 3788, + "time_per_iteration": 2.4467294216156006 + }, + { + "auxiliary_loss_clip": 0.01114179, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.04512429, + "balance_loss_mlp": 1.01945496, + "epoch": 0.4556003126315157, + "flos": 20047778766720.0, + "grad_norm": 1.7535612715074247, + "language_loss": 0.82016379, + "learning_rate": 2.3824420903853516e-06, + "loss": 0.84158474, + "num_input_tokens_seen": 81602175, + "step": 3789, + "time_per_iteration": 2.6008527278900146 + }, + { + "auxiliary_loss_clip": 0.01170612, + "auxiliary_loss_mlp": 0.01028043, + "balance_loss_clip": 1.0553441, + "balance_loss_mlp": 1.01972854, + "epoch": 0.45572055552215474, + "flos": 22959738443520.0, + "grad_norm": 2.1186014518042464, + "language_loss": 0.81559336, + "learning_rate": 2.3816774641136324e-06, + "loss": 0.83757997, + "num_input_tokens_seen": 81619430, + "step": 3790, + "time_per_iteration": 2.4550976753234863 + }, + { + "auxiliary_loss_clip": 0.01168393, + "auxiliary_loss_mlp": 0.00763052, + "balance_loss_clip": 1.05338836, + "balance_loss_mlp": 1.00040102, + "epoch": 0.45584079841279385, + "flos": 33109925535360.0, + "grad_norm": 1.8800406877513136, + "language_loss": 0.71113884, + "learning_rate": 2.380912779941745e-06, + "loss": 0.73045325, + "num_input_tokens_seen": 81642550, + "step": 3791, + "time_per_iteration": 2.550929546356201 + }, + { + "auxiliary_loss_clip": 0.01171577, + "auxiliary_loss_mlp": 0.01037613, + "balance_loss_clip": 1.05014384, + "balance_loss_mlp": 1.0277189, + "epoch": 0.45596104130343296, + "flos": 27272179445760.0, + "grad_norm": 1.8494476814431424, + "language_loss": 0.82886106, + "learning_rate": 2.3801480379856918e-06, + "loss": 0.85095298, + "num_input_tokens_seen": 81664260, + "step": 3792, + "time_per_iteration": 2.5273876190185547 + }, + { + "auxiliary_loss_clip": 0.01158897, + "auxiliary_loss_mlp": 0.01033136, + "balance_loss_clip": 1.05326509, + "balance_loss_mlp": 1.02513969, + "epoch": 0.456081284194072, + "flos": 21579799697280.0, + "grad_norm": 1.7286153991026363, + "language_loss": 0.83240891, + "learning_rate": 2.379383238361484e-06, + "loss": 0.85432923, + "num_input_tokens_seen": 81683620, + "step": 3793, + "time_per_iteration": 2.487133741378784 + }, + { + "auxiliary_loss_clip": 0.01166915, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.05071235, + "balance_loss_mlp": 1.02147329, + "epoch": 0.4562015270847111, + "flos": 35918822113920.0, + "grad_norm": 2.4842672936168753, + "language_loss": 0.79355621, + "learning_rate": 2.3786183811851407e-06, + "loss": 0.81552267, + "num_input_tokens_seen": 81704325, + "step": 3794, + "time_per_iteration": 2.5594379901885986 + }, + { + "auxiliary_loss_clip": 0.01186374, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.05726063, + "balance_loss_mlp": 1.02073193, + "epoch": 0.45632176997535023, + "flos": 13589783602560.0, + "grad_norm": 1.6789033066313022, + "language_loss": 0.80015802, + "learning_rate": 2.3778534665726892e-06, + "loss": 0.82230949, + "num_input_tokens_seen": 81721155, + "step": 3795, + "time_per_iteration": 2.379777669906616 + }, + { + "auxiliary_loss_clip": 0.01159261, + "auxiliary_loss_mlp": 0.0103411, + "balance_loss_clip": 1.05166936, + "balance_loss_mlp": 1.0261054, + "epoch": 0.4564420128659893, + "flos": 32635401937920.0, + "grad_norm": 2.786734573177213, + "language_loss": 0.72400242, + "learning_rate": 2.377088494640168e-06, + "loss": 0.74593616, + "num_input_tokens_seen": 81742905, + "step": 3796, + "time_per_iteration": 2.5297863483428955 + }, + { + "auxiliary_loss_clip": 0.01164883, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.05331361, + "balance_loss_mlp": 1.02206349, + "epoch": 0.4565622557566284, + "flos": 20377690208640.0, + "grad_norm": 4.242723131690308, + "language_loss": 0.78136063, + "learning_rate": 2.3763234655036216e-06, + "loss": 0.80331087, + "num_input_tokens_seen": 81762105, + "step": 3797, + "time_per_iteration": 2.4528684616088867 + }, + { + "auxiliary_loss_clip": 0.01137969, + "auxiliary_loss_mlp": 0.01028854, + "balance_loss_clip": 1.04526961, + "balance_loss_mlp": 1.02058685, + "epoch": 0.45668249864726745, + "flos": 25374372364800.0, + "grad_norm": 2.0709775984358423, + "language_loss": 0.86847389, + "learning_rate": 2.3755583792791046e-06, + "loss": 0.89014214, + "num_input_tokens_seen": 81781975, + "step": 3798, + "time_per_iteration": 2.539468288421631 + }, + { + "auxiliary_loss_clip": 0.01168977, + "auxiliary_loss_mlp": 0.01024162, + "balance_loss_clip": 1.05152678, + "balance_loss_mlp": 1.01635385, + "epoch": 0.45680274153790656, + "flos": 15559806977280.0, + "grad_norm": 2.146732562241273, + "language_loss": 0.74601626, + "learning_rate": 2.3747932360826803e-06, + "loss": 0.76794761, + "num_input_tokens_seen": 81798905, + "step": 3799, + "time_per_iteration": 2.445308208465576 + }, + { + "auxiliary_loss_clip": 0.01169416, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.05378258, + "balance_loss_mlp": 1.02067089, + "epoch": 0.4569229844285457, + "flos": 19792884879360.0, + "grad_norm": 3.0735226956985437, + "language_loss": 0.8221522, + "learning_rate": 2.3740280360304205e-06, + "loss": 0.84414005, + "num_input_tokens_seen": 81816630, + "step": 3800, + "time_per_iteration": 2.4315550327301025 + }, + { + "auxiliary_loss_clip": 0.01140648, + "auxiliary_loss_mlp": 0.01029107, + "balance_loss_clip": 1.05232584, + "balance_loss_mlp": 1.02033925, + "epoch": 0.45704322731918473, + "flos": 24093941270400.0, + "grad_norm": 1.665226475130565, + "language_loss": 0.68208665, + "learning_rate": 2.3732627792384038e-06, + "loss": 0.70378417, + "num_input_tokens_seen": 81837700, + "step": 3801, + "time_per_iteration": 2.583601713180542 + }, + { + "auxiliary_loss_clip": 0.01184084, + "auxiliary_loss_mlp": 0.01027911, + "balance_loss_clip": 1.05372906, + "balance_loss_mlp": 1.01939917, + "epoch": 0.45716347020982384, + "flos": 31317803245440.0, + "grad_norm": 3.0154721142397674, + "language_loss": 0.75556082, + "learning_rate": 2.3724974658227207e-06, + "loss": 0.77768075, + "num_input_tokens_seen": 81858490, + "step": 3802, + "time_per_iteration": 2.497620105743408 + }, + { + "auxiliary_loss_clip": 0.01154197, + "auxiliary_loss_mlp": 0.00763119, + "balance_loss_clip": 1.05299437, + "balance_loss_mlp": 1.0003581, + "epoch": 0.45728371310046295, + "flos": 26501392471680.0, + "grad_norm": 2.28540710772405, + "language_loss": 0.71281683, + "learning_rate": 2.3717320958994687e-06, + "loss": 0.73199004, + "num_input_tokens_seen": 81876050, + "step": 3803, + "time_per_iteration": 3.362089157104492 + }, + { + "auxiliary_loss_clip": 0.01137497, + "auxiliary_loss_mlp": 0.01026956, + "balance_loss_clip": 1.04297101, + "balance_loss_mlp": 1.0193181, + "epoch": 0.457403955991102, + "flos": 17929408222080.0, + "grad_norm": 2.0194345487736043, + "language_loss": 0.70159721, + "learning_rate": 2.3709666695847534e-06, + "loss": 0.72324181, + "num_input_tokens_seen": 81894230, + "step": 3804, + "time_per_iteration": 2.48596453666687 + }, + { + "auxiliary_loss_clip": 0.01118354, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.04534924, + "balance_loss_mlp": 1.02048957, + "epoch": 0.4575241988817411, + "flos": 42230660837760.0, + "grad_norm": 1.5583618484445732, + "language_loss": 0.69771802, + "learning_rate": 2.370201186994689e-06, + "loss": 0.71918273, + "num_input_tokens_seen": 81917915, + "step": 3805, + "time_per_iteration": 2.735992670059204 + }, + { + "auxiliary_loss_clip": 0.01146528, + "auxiliary_loss_mlp": 0.01026787, + "balance_loss_clip": 1.05204535, + "balance_loss_mlp": 1.01884794, + "epoch": 0.45764444177238023, + "flos": 30117309868800.0, + "grad_norm": 2.056254989275629, + "language_loss": 0.69927001, + "learning_rate": 2.369435648245399e-06, + "loss": 0.72100317, + "num_input_tokens_seen": 81938130, + "step": 3806, + "time_per_iteration": 3.4126553535461426 + }, + { + "auxiliary_loss_clip": 0.01155552, + "auxiliary_loss_mlp": 0.01038305, + "balance_loss_clip": 1.05150104, + "balance_loss_mlp": 1.02960253, + "epoch": 0.4577646846630193, + "flos": 24060293205120.0, + "grad_norm": 1.679726526005014, + "language_loss": 0.84967977, + "learning_rate": 2.368670053453015e-06, + "loss": 0.87161839, + "num_input_tokens_seen": 81959820, + "step": 3807, + "time_per_iteration": 2.535785436630249 + }, + { + "auxiliary_loss_clip": 0.01178461, + "auxiliary_loss_mlp": 0.0102995, + "balance_loss_clip": 1.05724096, + "balance_loss_mlp": 1.02100909, + "epoch": 0.4578849275536584, + "flos": 17418578952960.0, + "grad_norm": 2.164443176570588, + "language_loss": 0.7445218, + "learning_rate": 2.3679044027336757e-06, + "loss": 0.76660591, + "num_input_tokens_seen": 81975710, + "step": 3808, + "time_per_iteration": 3.157886505126953 + }, + { + "auxiliary_loss_clip": 0.01183969, + "auxiliary_loss_mlp": 0.01028808, + "balance_loss_clip": 1.05366921, + "balance_loss_mlp": 1.01993835, + "epoch": 0.4580051704442975, + "flos": 13510169107200.0, + "grad_norm": 4.034110780753551, + "language_loss": 0.69298583, + "learning_rate": 2.3671386962035326e-06, + "loss": 0.71511364, + "num_input_tokens_seen": 81993180, + "step": 3809, + "time_per_iteration": 2.4340832233428955 + }, + { + "auxiliary_loss_clip": 0.01169907, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.05239487, + "balance_loss_mlp": 1.02232683, + "epoch": 0.45812541333493656, + "flos": 18037606965120.0, + "grad_norm": 1.9833936227468862, + "language_loss": 0.68344605, + "learning_rate": 2.3663729339787405e-06, + "loss": 0.70545149, + "num_input_tokens_seen": 82010115, + "step": 3810, + "time_per_iteration": 2.450411558151245 + }, + { + "auxiliary_loss_clip": 0.0118401, + "auxiliary_loss_mlp": 0.01027722, + "balance_loss_clip": 1.05341601, + "balance_loss_mlp": 1.01924682, + "epoch": 0.45824565622557567, + "flos": 20222196232320.0, + "grad_norm": 2.343059868702884, + "language_loss": 0.7365219, + "learning_rate": 2.365607116175466e-06, + "loss": 0.75863922, + "num_input_tokens_seen": 82025540, + "step": 3811, + "time_per_iteration": 3.1849732398986816 + }, + { + "auxiliary_loss_clip": 0.01181244, + "auxiliary_loss_mlp": 0.01025431, + "balance_loss_clip": 1.05337, + "balance_loss_mlp": 1.01740277, + "epoch": 0.4583658991162148, + "flos": 19864885691520.0, + "grad_norm": 2.6216325182817033, + "language_loss": 0.67094958, + "learning_rate": 2.3648412429098825e-06, + "loss": 0.69301629, + "num_input_tokens_seen": 82043890, + "step": 3812, + "time_per_iteration": 2.469998359680176 + }, + { + "auxiliary_loss_clip": 0.0113707, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.0495019, + "balance_loss_mlp": 1.02555776, + "epoch": 0.45848614200685384, + "flos": 21029935322880.0, + "grad_norm": 2.107825383099602, + "language_loss": 0.82321841, + "learning_rate": 2.364075314298172e-06, + "loss": 0.84493411, + "num_input_tokens_seen": 82061345, + "step": 3813, + "time_per_iteration": 2.5379202365875244 + }, + { + "auxiliary_loss_clip": 0.0117526, + "auxiliary_loss_mlp": 0.00763366, + "balance_loss_clip": 1.05519104, + "balance_loss_mlp": 1.0002799, + "epoch": 0.45860638489749295, + "flos": 21069293650560.0, + "grad_norm": 2.1540155947863795, + "language_loss": 0.70433772, + "learning_rate": 2.3633093304565267e-06, + "loss": 0.72372401, + "num_input_tokens_seen": 82080400, + "step": 3814, + "time_per_iteration": 2.4544529914855957 + }, + { + "auxiliary_loss_clip": 0.01188547, + "auxiliary_loss_mlp": 0.01028813, + "balance_loss_clip": 1.05673897, + "balance_loss_mlp": 1.02019429, + "epoch": 0.458726627788132, + "flos": 26833889692800.0, + "grad_norm": 1.9495933503349816, + "language_loss": 0.63341087, + "learning_rate": 2.3625432915011443e-06, + "loss": 0.65558445, + "num_input_tokens_seen": 82102310, + "step": 3815, + "time_per_iteration": 2.4853720664978027 + }, + { + "auxiliary_loss_clip": 0.01150028, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.05091357, + "balance_loss_mlp": 1.02560687, + "epoch": 0.4588468706787711, + "flos": 24097927680000.0, + "grad_norm": 1.6675263893726129, + "language_loss": 0.65215021, + "learning_rate": 2.3617771975482334e-06, + "loss": 0.67399061, + "num_input_tokens_seen": 82121140, + "step": 3816, + "time_per_iteration": 2.496014356613159 + }, + { + "auxiliary_loss_clip": 0.01120809, + "auxiliary_loss_mlp": 0.0102628, + "balance_loss_clip": 1.04783297, + "balance_loss_mlp": 1.01863861, + "epoch": 0.4589671135694102, + "flos": 17889331622400.0, + "grad_norm": 1.658072832036926, + "language_loss": 0.74629033, + "learning_rate": 2.3610110487140083e-06, + "loss": 0.76776111, + "num_input_tokens_seen": 82139575, + "step": 3817, + "time_per_iteration": 2.5430617332458496 + }, + { + "auxiliary_loss_clip": 0.01156699, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.0539124, + "balance_loss_mlp": 1.02509832, + "epoch": 0.4590873564600493, + "flos": 25626967781760.0, + "grad_norm": 1.794561486391742, + "language_loss": 0.8051284, + "learning_rate": 2.360244845114695e-06, + "loss": 0.82702994, + "num_input_tokens_seen": 82159195, + "step": 3818, + "time_per_iteration": 2.528165578842163 + }, + { + "auxiliary_loss_clip": 0.01150289, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.05344081, + "balance_loss_mlp": 1.01942766, + "epoch": 0.4592075993506884, + "flos": 18514788168960.0, + "grad_norm": 2.0803525924065585, + "language_loss": 0.68684983, + "learning_rate": 2.3594785868665245e-06, + "loss": 0.7086333, + "num_input_tokens_seen": 82175500, + "step": 3819, + "time_per_iteration": 2.473207712173462 + }, + { + "auxiliary_loss_clip": 0.01143613, + "auxiliary_loss_mlp": 0.00763172, + "balance_loss_clip": 1.05111611, + "balance_loss_mlp": 1.00028408, + "epoch": 0.4593278422413275, + "flos": 20631111638400.0, + "grad_norm": 2.3835872762282255, + "language_loss": 0.80651808, + "learning_rate": 2.3587122740857386e-06, + "loss": 0.82558596, + "num_input_tokens_seen": 82192600, + "step": 3820, + "time_per_iteration": 2.5180118083953857 + }, + { + "auxiliary_loss_clip": 0.01165594, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.0503968, + "balance_loss_mlp": 1.01959991, + "epoch": 0.45944808513196655, + "flos": 21358517961600.0, + "grad_norm": 1.7674697716118215, + "language_loss": 0.77810109, + "learning_rate": 2.357945906888586e-06, + "loss": 0.80003089, + "num_input_tokens_seen": 82212040, + "step": 3821, + "time_per_iteration": 2.4755611419677734 + }, + { + "auxiliary_loss_clip": 0.01172685, + "auxiliary_loss_mlp": 0.01032657, + "balance_loss_clip": 1.05465913, + "balance_loss_mlp": 1.02368617, + "epoch": 0.45956832802260567, + "flos": 21427789340160.0, + "grad_norm": 6.857630860590178, + "language_loss": 0.79621851, + "learning_rate": 2.357179485391324e-06, + "loss": 0.81827199, + "num_input_tokens_seen": 82229895, + "step": 3822, + "time_per_iteration": 2.4633724689483643 + }, + { + "auxiliary_loss_clip": 0.01184048, + "auxiliary_loss_mlp": 0.0102639, + "balance_loss_clip": 1.05670071, + "balance_loss_mlp": 1.01808143, + "epoch": 0.4596885709132448, + "flos": 22382654538240.0, + "grad_norm": 3.3939572568218463, + "language_loss": 0.86541921, + "learning_rate": 2.3564130097102173e-06, + "loss": 0.88752359, + "num_input_tokens_seen": 82249550, + "step": 3823, + "time_per_iteration": 2.431030750274658 + }, + { + "auxiliary_loss_clip": 0.01148772, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_clip": 1.05376208, + "balance_loss_mlp": 1.01805544, + "epoch": 0.45980881380388383, + "flos": 28981957806720.0, + "grad_norm": 1.753479658580137, + "language_loss": 0.75157654, + "learning_rate": 2.355646479961541e-06, + "loss": 0.77333236, + "num_input_tokens_seen": 82268860, + "step": 3824, + "time_per_iteration": 2.6220083236694336 + }, + { + "auxiliary_loss_clip": 0.01182269, + "auxiliary_loss_mlp": 0.01026609, + "balance_loss_clip": 1.05386233, + "balance_loss_mlp": 1.01755476, + "epoch": 0.45992905669452294, + "flos": 33396599980800.0, + "grad_norm": 2.1645082455652007, + "language_loss": 0.71528888, + "learning_rate": 2.354879896261576e-06, + "loss": 0.73737764, + "num_input_tokens_seen": 82289070, + "step": 3825, + "time_per_iteration": 2.5155882835388184 + }, + { + "auxiliary_loss_clip": 0.01138515, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.05234957, + "balance_loss_mlp": 1.02254975, + "epoch": 0.46004929958516205, + "flos": 36318184502400.0, + "grad_norm": 1.8018786347648352, + "language_loss": 0.56658053, + "learning_rate": 2.3541132587266133e-06, + "loss": 0.58827055, + "num_input_tokens_seen": 82311790, + "step": 3826, + "time_per_iteration": 2.650313377380371 + }, + { + "auxiliary_loss_clip": 0.01147093, + "auxiliary_loss_mlp": 0.01026609, + "balance_loss_clip": 1.0502224, + "balance_loss_mlp": 1.01831234, + "epoch": 0.4601695424758011, + "flos": 17238451224960.0, + "grad_norm": 1.9524804197617511, + "language_loss": 0.69235218, + "learning_rate": 2.3533465674729515e-06, + "loss": 0.71408916, + "num_input_tokens_seen": 82329020, + "step": 3827, + "time_per_iteration": 2.5046355724334717 + }, + { + "auxiliary_loss_clip": 0.01184545, + "auxiliary_loss_mlp": 0.01030169, + "balance_loss_clip": 1.05534017, + "balance_loss_mlp": 1.0213778, + "epoch": 0.4602897853664402, + "flos": 15888425529600.0, + "grad_norm": 1.9207453384285458, + "language_loss": 0.72819066, + "learning_rate": 2.352579822616895e-06, + "loss": 0.75033784, + "num_input_tokens_seen": 82346455, + "step": 3828, + "time_per_iteration": 2.4058139324188232 + }, + { + "auxiliary_loss_clip": 0.01157537, + "auxiliary_loss_mlp": 0.01026007, + "balance_loss_clip": 1.05190682, + "balance_loss_mlp": 1.01799881, + "epoch": 0.4604100282570793, + "flos": 25412617370880.0, + "grad_norm": 1.68594507157666, + "language_loss": 0.77542496, + "learning_rate": 2.351813024274761e-06, + "loss": 0.7972604, + "num_input_tokens_seen": 82367810, + "step": 3829, + "time_per_iteration": 3.362781286239624 + }, + { + "auxiliary_loss_clip": 0.01145407, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.05089378, + "balance_loss_mlp": 1.02305484, + "epoch": 0.4605302711477184, + "flos": 27630711048960.0, + "grad_norm": 1.7564613473035389, + "language_loss": 0.73894072, + "learning_rate": 2.3510461725628693e-06, + "loss": 0.76070833, + "num_input_tokens_seen": 82388275, + "step": 3830, + "time_per_iteration": 2.5806941986083984 + }, + { + "auxiliary_loss_clip": 0.01143263, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.05027628, + "balance_loss_mlp": 1.02035475, + "epoch": 0.4606505140383575, + "flos": 23839657914240.0, + "grad_norm": 1.7831035193181826, + "language_loss": 0.71025854, + "learning_rate": 2.350279267597554e-06, + "loss": 0.73197734, + "num_input_tokens_seen": 82408915, + "step": 3831, + "time_per_iteration": 2.596791982650757 + }, + { + "auxiliary_loss_clip": 0.01171302, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.05531764, + "balance_loss_mlp": 1.02201009, + "epoch": 0.46077075692899655, + "flos": 16107013745280.0, + "grad_norm": 2.1336997328389207, + "language_loss": 0.82908005, + "learning_rate": 2.3495123094951515e-06, + "loss": 0.8510983, + "num_input_tokens_seen": 82427260, + "step": 3832, + "time_per_iteration": 2.4435081481933594 + }, + { + "auxiliary_loss_clip": 0.01148236, + "auxiliary_loss_mlp": 0.01023938, + "balance_loss_clip": 1.05136442, + "balance_loss_mlp": 1.01548648, + "epoch": 0.46089099981963566, + "flos": 48798147634560.0, + "grad_norm": 1.9922786381061162, + "language_loss": 0.75751519, + "learning_rate": 2.34874529837201e-06, + "loss": 0.77923691, + "num_input_tokens_seen": 82450805, + "step": 3833, + "time_per_iteration": 3.5840744972229004 + }, + { + "auxiliary_loss_clip": 0.01107561, + "auxiliary_loss_mlp": 0.01022866, + "balance_loss_clip": 1.04419804, + "balance_loss_mlp": 1.01472449, + "epoch": 0.46101124271027477, + "flos": 19099234362240.0, + "grad_norm": 2.171825253013662, + "language_loss": 0.79380012, + "learning_rate": 2.347978234344483e-06, + "loss": 0.81510437, + "num_input_tokens_seen": 82467010, + "step": 3834, + "time_per_iteration": 2.5585708618164062 + }, + { + "auxiliary_loss_clip": 0.01173899, + "auxiliary_loss_mlp": 0.0103647, + "balance_loss_clip": 1.0553906, + "balance_loss_mlp": 1.02728498, + "epoch": 0.4611314856009138, + "flos": 39347931853440.0, + "grad_norm": 1.8323993137472896, + "language_loss": 0.69013137, + "learning_rate": 2.347211117528935e-06, + "loss": 0.71223503, + "num_input_tokens_seen": 82489310, + "step": 3835, + "time_per_iteration": 3.407909393310547 + }, + { + "auxiliary_loss_clip": 0.01150674, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.05464113, + "balance_loss_mlp": 1.02174199, + "epoch": 0.46125172849155294, + "flos": 20810772489600.0, + "grad_norm": 1.5469928731392724, + "language_loss": 0.71623951, + "learning_rate": 2.3464439480417374e-06, + "loss": 0.73804832, + "num_input_tokens_seen": 82508830, + "step": 3836, + "time_per_iteration": 2.569108724594116 + }, + { + "auxiliary_loss_clip": 0.01171436, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.0537746, + "balance_loss_mlp": 1.0240624, + "epoch": 0.46137197138219205, + "flos": 17930808852480.0, + "grad_norm": 2.350554078955025, + "language_loss": 0.77454567, + "learning_rate": 2.3456767259992676e-06, + "loss": 0.79658628, + "num_input_tokens_seen": 82526475, + "step": 3837, + "time_per_iteration": 2.458272933959961 + }, + { + "auxiliary_loss_clip": 0.0118257, + "auxiliary_loss_mlp": 0.00763591, + "balance_loss_clip": 1.05277205, + "balance_loss_mlp": 1.00021482, + "epoch": 0.4614922142728311, + "flos": 16836610798080.0, + "grad_norm": 2.6381895017292294, + "language_loss": 0.88564372, + "learning_rate": 2.3449094515179135e-06, + "loss": 0.90510535, + "num_input_tokens_seen": 82543935, + "step": 3838, + "time_per_iteration": 3.176616907119751 + }, + { + "auxiliary_loss_clip": 0.01160292, + "auxiliary_loss_mlp": 0.01027995, + "balance_loss_clip": 1.05135155, + "balance_loss_mlp": 1.01922071, + "epoch": 0.4616124571634702, + "flos": 26614906427520.0, + "grad_norm": 1.6624241672844116, + "language_loss": 0.81702513, + "learning_rate": 2.34414212471407e-06, + "loss": 0.83890796, + "num_input_tokens_seen": 82563730, + "step": 3839, + "time_per_iteration": 2.5248005390167236 + }, + { + "auxiliary_loss_clip": 0.01176128, + "auxiliary_loss_mlp": 0.01026621, + "balance_loss_clip": 1.05352831, + "balance_loss_mlp": 1.01817524, + "epoch": 0.4617327000541093, + "flos": 20340127560960.0, + "grad_norm": 1.939585706846273, + "language_loss": 0.72500026, + "learning_rate": 2.3433747457041394e-06, + "loss": 0.74702775, + "num_input_tokens_seen": 82582435, + "step": 3840, + "time_per_iteration": 2.439389228820801 + }, + { + "auxiliary_loss_clip": 0.01143336, + "auxiliary_loss_mlp": 0.01030357, + "balance_loss_clip": 1.05171943, + "balance_loss_mlp": 1.02132726, + "epoch": 0.4618529429447484, + "flos": 29570749545600.0, + "grad_norm": 1.9176713613126162, + "language_loss": 0.84948707, + "learning_rate": 2.342607314604533e-06, + "loss": 0.87122405, + "num_input_tokens_seen": 82602185, + "step": 3841, + "time_per_iteration": 2.584968090057373 + }, + { + "auxiliary_loss_clip": 0.01172325, + "auxiliary_loss_mlp": 0.01031801, + "balance_loss_clip": 1.05704713, + "balance_loss_mlp": 1.02339697, + "epoch": 0.4619731858353875, + "flos": 19787030962560.0, + "grad_norm": 1.8793781865567267, + "language_loss": 0.83882046, + "learning_rate": 2.3418398315316694e-06, + "loss": 0.86086166, + "num_input_tokens_seen": 82620005, + "step": 3842, + "time_per_iteration": 2.4580259323120117 + }, + { + "auxiliary_loss_clip": 0.01185784, + "auxiliary_loss_mlp": 0.01039139, + "balance_loss_clip": 1.05766439, + "balance_loss_mlp": 1.03060961, + "epoch": 0.4620934287260266, + "flos": 18951138587520.0, + "grad_norm": 2.2831325224672594, + "language_loss": 0.77869642, + "learning_rate": 2.3410722966019755e-06, + "loss": 0.8009457, + "num_input_tokens_seen": 82635120, + "step": 3843, + "time_per_iteration": 2.396442174911499 + }, + { + "auxiliary_loss_clip": 0.01168172, + "auxiliary_loss_mlp": 0.0102786, + "balance_loss_clip": 1.0526011, + "balance_loss_mlp": 1.01926446, + "epoch": 0.46221367161666566, + "flos": 37341674634240.0, + "grad_norm": 1.85605789797141, + "language_loss": 0.65926111, + "learning_rate": 2.3403047099318848e-06, + "loss": 0.68122143, + "num_input_tokens_seen": 82659190, + "step": 3844, + "time_per_iteration": 2.5998647212982178 + }, + { + "auxiliary_loss_clip": 0.01122707, + "auxiliary_loss_mlp": 0.01024872, + "balance_loss_clip": 1.0470562, + "balance_loss_mlp": 1.01669991, + "epoch": 0.46233391450730477, + "flos": 14428549065600.0, + "grad_norm": 2.099728382360556, + "language_loss": 0.75131309, + "learning_rate": 2.3395370716378405e-06, + "loss": 0.77278888, + "num_input_tokens_seen": 82676635, + "step": 3845, + "time_per_iteration": 2.537075996398926 + }, + { + "auxiliary_loss_clip": 0.01173542, + "auxiliary_loss_mlp": 0.01032499, + "balance_loss_clip": 1.05349278, + "balance_loss_mlp": 1.02434552, + "epoch": 0.4624541573979438, + "flos": 22493044010880.0, + "grad_norm": 2.342088528436429, + "language_loss": 0.72600794, + "learning_rate": 2.338769381836292e-06, + "loss": 0.74806833, + "num_input_tokens_seen": 82696245, + "step": 3846, + "time_per_iteration": 2.4618353843688965 + }, + { + "auxiliary_loss_clip": 0.01140005, + "auxiliary_loss_mlp": 0.01034251, + "balance_loss_clip": 1.05322385, + "balance_loss_mlp": 1.02590084, + "epoch": 0.46257440028858293, + "flos": 14465070218880.0, + "grad_norm": 2.3751685649516583, + "language_loss": 0.73069572, + "learning_rate": 2.3380016406436984e-06, + "loss": 0.75243831, + "num_input_tokens_seen": 82713725, + "step": 3847, + "time_per_iteration": 2.506568193435669 + }, + { + "auxiliary_loss_clip": 0.01129059, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.05363083, + "balance_loss_mlp": 1.02552223, + "epoch": 0.46269464317922204, + "flos": 23332204523520.0, + "grad_norm": 2.1321927862599144, + "language_loss": 0.8112278, + "learning_rate": 2.337233848176524e-06, + "loss": 0.8328656, + "num_input_tokens_seen": 82731495, + "step": 3848, + "time_per_iteration": 2.5549581050872803 + }, + { + "auxiliary_loss_clip": 0.01121183, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.04880321, + "balance_loss_mlp": 1.02044988, + "epoch": 0.4628148860698611, + "flos": 18552027594240.0, + "grad_norm": 2.0328690964437954, + "language_loss": 0.8365525, + "learning_rate": 2.3364660045512435e-06, + "loss": 0.85805136, + "num_input_tokens_seen": 82750255, + "step": 3849, + "time_per_iteration": 2.545180082321167 + }, + { + "auxiliary_loss_clip": 0.01062477, + "auxiliary_loss_mlp": 0.01003105, + "balance_loss_clip": 1.02090359, + "balance_loss_mlp": 1.002038, + "epoch": 0.4629351289605002, + "flos": 70667569670400.0, + "grad_norm": 1.0325058665732432, + "language_loss": 0.58241451, + "learning_rate": 2.335698109884337e-06, + "loss": 0.60307026, + "num_input_tokens_seen": 82815460, + "step": 3850, + "time_per_iteration": 3.229626178741455 + }, + { + "auxiliary_loss_clip": 0.01050997, + "auxiliary_loss_mlp": 0.01003792, + "balance_loss_clip": 1.03229666, + "balance_loss_mlp": 1.00217104, + "epoch": 0.4630553718511393, + "flos": 59687200465920.0, + "grad_norm": 0.7892037359808743, + "language_loss": 0.59842908, + "learning_rate": 2.334930164292294e-06, + "loss": 0.61897695, + "num_input_tokens_seen": 82878010, + "step": 3851, + "time_per_iteration": 3.260227918624878 + }, + { + "auxiliary_loss_clip": 0.01120033, + "auxiliary_loss_mlp": 0.01028735, + "balance_loss_clip": 1.04607761, + "balance_loss_mlp": 1.02115333, + "epoch": 0.4631756147417784, + "flos": 15960605909760.0, + "grad_norm": 2.0209326958860667, + "language_loss": 0.79903626, + "learning_rate": 2.334162167891612e-06, + "loss": 0.82052398, + "num_input_tokens_seen": 82895275, + "step": 3852, + "time_per_iteration": 2.555870294570923 + }, + { + "auxiliary_loss_clip": 0.01157046, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.05050707, + "balance_loss_mlp": 1.02332997, + "epoch": 0.4632958576324175, + "flos": 16472907636480.0, + "grad_norm": 2.4383968191116066, + "language_loss": 0.75190365, + "learning_rate": 2.333394120798795e-06, + "loss": 0.77379799, + "num_input_tokens_seen": 82914010, + "step": 3853, + "time_per_iteration": 2.510403871536255 + }, + { + "auxiliary_loss_clip": 0.01153787, + "auxiliary_loss_mlp": 0.01023884, + "balance_loss_clip": 1.0492934, + "balance_loss_mlp": 1.01502132, + "epoch": 0.4634161005230566, + "flos": 22346492520960.0, + "grad_norm": 2.247237940543874, + "language_loss": 0.71964312, + "learning_rate": 2.3326260231303545e-06, + "loss": 0.74141979, + "num_input_tokens_seen": 82932610, + "step": 3854, + "time_per_iteration": 2.5033962726593018 + }, + { + "auxiliary_loss_clip": 0.01182588, + "auxiliary_loss_mlp": 0.01025743, + "balance_loss_clip": 1.05686522, + "balance_loss_mlp": 1.01766622, + "epoch": 0.46353634341369565, + "flos": 15742233175680.0, + "grad_norm": 1.6235630507052292, + "language_loss": 0.86317807, + "learning_rate": 2.331857875002811e-06, + "loss": 0.88526136, + "num_input_tokens_seen": 82951210, + "step": 3855, + "time_per_iteration": 2.417605400085449 + }, + { + "auxiliary_loss_clip": 0.01157342, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.05550969, + "balance_loss_mlp": 1.02400088, + "epoch": 0.46365658630433476, + "flos": 28329820433280.0, + "grad_norm": 1.6870892582817414, + "language_loss": 0.75995284, + "learning_rate": 2.3310896765326916e-06, + "loss": 0.7818476, + "num_input_tokens_seen": 82972210, + "step": 3856, + "time_per_iteration": 3.3949105739593506 + }, + { + "auxiliary_loss_clip": 0.01138665, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.05045128, + "balance_loss_mlp": 1.02427781, + "epoch": 0.46377682919497387, + "flos": 24608074590720.0, + "grad_norm": 1.5815510143294145, + "language_loss": 0.84264368, + "learning_rate": 2.330321427836531e-06, + "loss": 0.86436093, + "num_input_tokens_seen": 82994080, + "step": 3857, + "time_per_iteration": 2.5600061416625977 + }, + { + "auxiliary_loss_clip": 0.01165121, + "auxiliary_loss_mlp": 0.01026022, + "balance_loss_clip": 1.05233979, + "balance_loss_mlp": 1.01702785, + "epoch": 0.4638970720856129, + "flos": 19060953442560.0, + "grad_norm": 1.605440585062139, + "language_loss": 0.82842326, + "learning_rate": 2.3295531290308733e-06, + "loss": 0.8503347, + "num_input_tokens_seen": 83012230, + "step": 3858, + "time_per_iteration": 2.44889760017395 + }, + { + "auxiliary_loss_clip": 0.01185408, + "auxiliary_loss_mlp": 0.00763224, + "balance_loss_clip": 1.05544567, + "balance_loss_mlp": 1.00024676, + "epoch": 0.46401731497625204, + "flos": 18471012468480.0, + "grad_norm": 2.515361157905502, + "language_loss": 0.75979018, + "learning_rate": 2.3287847802322678e-06, + "loss": 0.77927655, + "num_input_tokens_seen": 83027800, + "step": 3859, + "time_per_iteration": 2.388307571411133 + }, + { + "auxiliary_loss_clip": 0.01164749, + "auxiliary_loss_mlp": 0.01026611, + "balance_loss_clip": 1.05549955, + "balance_loss_mlp": 1.01785541, + "epoch": 0.4641375578668911, + "flos": 26067053214720.0, + "grad_norm": 1.730979245222694, + "language_loss": 0.83645052, + "learning_rate": 2.3280163815572723e-06, + "loss": 0.85836411, + "num_input_tokens_seen": 83048395, + "step": 3860, + "time_per_iteration": 3.386744499206543 + }, + { + "auxiliary_loss_clip": 0.01146104, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.04920149, + "balance_loss_mlp": 1.01835442, + "epoch": 0.4642578007575302, + "flos": 19570382081280.0, + "grad_norm": 2.2315802115092533, + "language_loss": 0.76647878, + "learning_rate": 2.3272479331224522e-06, + "loss": 0.78820431, + "num_input_tokens_seen": 83065825, + "step": 3861, + "time_per_iteration": 2.4834401607513428 + }, + { + "auxiliary_loss_clip": 0.01185677, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.05557144, + "balance_loss_mlp": 1.01763952, + "epoch": 0.4643780436481693, + "flos": 28186249772160.0, + "grad_norm": 1.659882677175064, + "language_loss": 0.77978754, + "learning_rate": 2.3264794350443817e-06, + "loss": 0.80190051, + "num_input_tokens_seen": 83087920, + "step": 3862, + "time_per_iteration": 3.3304646015167236 + }, + { + "auxiliary_loss_clip": 0.01170686, + "auxiliary_loss_mlp": 0.01024434, + "balance_loss_clip": 1.05011213, + "balance_loss_mlp": 1.01621449, + "epoch": 0.46449828653880837, + "flos": 25375270204800.0, + "grad_norm": 1.8237793170602534, + "language_loss": 0.78528023, + "learning_rate": 2.3257108874396396e-06, + "loss": 0.80723143, + "num_input_tokens_seen": 83109015, + "step": 3863, + "time_per_iteration": 2.516768217086792 + }, + { + "auxiliary_loss_clip": 0.01155008, + "auxiliary_loss_mlp": 0.01037081, + "balance_loss_clip": 1.0512743, + "balance_loss_mlp": 1.02834868, + "epoch": 0.4646185294294475, + "flos": 16034330574720.0, + "grad_norm": 1.9480766458141583, + "language_loss": 0.73751605, + "learning_rate": 2.3249422904248152e-06, + "loss": 0.75943691, + "num_input_tokens_seen": 83127450, + "step": 3864, + "time_per_iteration": 3.2613096237182617 + }, + { + "auxiliary_loss_clip": 0.01171798, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.0532186, + "balance_loss_mlp": 1.02071977, + "epoch": 0.4647387723200866, + "flos": 26363101109760.0, + "grad_norm": 1.4045701801822494, + "language_loss": 0.86915839, + "learning_rate": 2.324173644116504e-06, + "loss": 0.89116168, + "num_input_tokens_seen": 83150300, + "step": 3865, + "time_per_iteration": 2.5376906394958496 + }, + { + "auxiliary_loss_clip": 0.01165689, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.0545218, + "balance_loss_mlp": 1.01884961, + "epoch": 0.46485901521072565, + "flos": 27160209774720.0, + "grad_norm": 2.0221487893706342, + "language_loss": 0.81538665, + "learning_rate": 2.3234049486313087e-06, + "loss": 0.83730888, + "num_input_tokens_seen": 83171750, + "step": 3866, + "time_per_iteration": 2.511503219604492 + }, + { + "auxiliary_loss_clip": 0.01166536, + "auxiliary_loss_mlp": 0.01026238, + "balance_loss_clip": 1.05282724, + "balance_loss_mlp": 1.01875186, + "epoch": 0.46497925810136476, + "flos": 24279851088000.0, + "grad_norm": 2.5105874711915845, + "language_loss": 0.75770646, + "learning_rate": 2.322636204085839e-06, + "loss": 0.77963424, + "num_input_tokens_seen": 83191820, + "step": 3867, + "time_per_iteration": 2.4915366172790527 + }, + { + "auxiliary_loss_clip": 0.01144801, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.04699802, + "balance_loss_mlp": 1.02395701, + "epoch": 0.46509950099200387, + "flos": 16253134272000.0, + "grad_norm": 2.1503541626242586, + "language_loss": 0.78623295, + "learning_rate": 2.3218674105967143e-06, + "loss": 0.80800462, + "num_input_tokens_seen": 83210085, + "step": 3868, + "time_per_iteration": 2.469186544418335 + }, + { + "auxiliary_loss_clip": 0.01148773, + "auxiliary_loss_mlp": 0.01029401, + "balance_loss_clip": 1.04975271, + "balance_loss_mlp": 1.02129507, + "epoch": 0.4652197438826429, + "flos": 23442270773760.0, + "grad_norm": 1.5589792835783813, + "language_loss": 0.8346957, + "learning_rate": 2.3210985682805593e-06, + "loss": 0.85647738, + "num_input_tokens_seen": 83231865, + "step": 3869, + "time_per_iteration": 2.558034658432007 + }, + { + "auxiliary_loss_clip": 0.01185475, + "auxiliary_loss_mlp": 0.01026578, + "balance_loss_clip": 1.05710042, + "balance_loss_mlp": 1.01839995, + "epoch": 0.46533998677328203, + "flos": 16216397637120.0, + "grad_norm": 2.2559173279609803, + "language_loss": 0.68302858, + "learning_rate": 2.320329677254007e-06, + "loss": 0.70514911, + "num_input_tokens_seen": 83249195, + "step": 3870, + "time_per_iteration": 2.3962395191192627 + }, + { + "auxiliary_loss_clip": 0.0118221, + "auxiliary_loss_mlp": 0.01026913, + "balance_loss_clip": 1.05451286, + "balance_loss_mlp": 1.01862216, + "epoch": 0.46546022966392114, + "flos": 21141869080320.0, + "grad_norm": 2.7025901157905556, + "language_loss": 0.72160763, + "learning_rate": 2.319560737633697e-06, + "loss": 0.74369889, + "num_input_tokens_seen": 83267915, + "step": 3871, + "time_per_iteration": 2.4524593353271484 + }, + { + "auxiliary_loss_clip": 0.011467, + "auxiliary_loss_mlp": 0.01028435, + "balance_loss_clip": 1.04927289, + "balance_loss_mlp": 1.01963747, + "epoch": 0.4655804725545602, + "flos": 41171942442240.0, + "grad_norm": 1.7049153411179157, + "language_loss": 0.6811558, + "learning_rate": 2.3187917495362775e-06, + "loss": 0.70290715, + "num_input_tokens_seen": 83292325, + "step": 3872, + "time_per_iteration": 2.7060701847076416 + }, + { + "auxiliary_loss_clip": 0.01126992, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.05036211, + "balance_loss_mlp": 1.0270648, + "epoch": 0.4657007154451993, + "flos": 19570956698880.0, + "grad_norm": 2.5185733140767805, + "language_loss": 0.77119905, + "learning_rate": 2.318022713078403e-06, + "loss": 0.79282498, + "num_input_tokens_seen": 83306905, + "step": 3873, + "time_per_iteration": 2.528513193130493 + }, + { + "auxiliary_loss_clip": 0.01154149, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.05237174, + "balance_loss_mlp": 1.02481437, + "epoch": 0.4658209583358384, + "flos": 15517826956800.0, + "grad_norm": 2.1639419508203273, + "language_loss": 0.84712434, + "learning_rate": 2.3172536283767354e-06, + "loss": 0.86899447, + "num_input_tokens_seen": 83320665, + "step": 3874, + "time_per_iteration": 2.446023464202881 + }, + { + "auxiliary_loss_clip": 0.01138724, + "auxiliary_loss_mlp": 0.01025478, + "balance_loss_clip": 1.05147064, + "balance_loss_mlp": 1.01665688, + "epoch": 0.4659412012264775, + "flos": 14903180403840.0, + "grad_norm": 2.42409469856878, + "language_loss": 0.80850017, + "learning_rate": 2.3164844955479447e-06, + "loss": 0.83014214, + "num_input_tokens_seen": 83336475, + "step": 3875, + "time_per_iteration": 2.4981679916381836 + }, + { + "auxiliary_loss_clip": 0.01136046, + "auxiliary_loss_mlp": 0.01026874, + "balance_loss_clip": 1.04999542, + "balance_loss_mlp": 1.01835036, + "epoch": 0.4660614441171166, + "flos": 24425612478720.0, + "grad_norm": 1.6047061721604563, + "language_loss": 0.70705074, + "learning_rate": 2.3157153147087082e-06, + "loss": 0.7286799, + "num_input_tokens_seen": 83358365, + "step": 3876, + "time_per_iteration": 2.606342315673828 + }, + { + "auxiliary_loss_clip": 0.01135429, + "auxiliary_loss_mlp": 0.01027477, + "balance_loss_clip": 1.0527494, + "balance_loss_mlp": 1.01957297, + "epoch": 0.46618168700775564, + "flos": 22091095843200.0, + "grad_norm": 1.8187304219263447, + "language_loss": 0.83066249, + "learning_rate": 2.314946085975709e-06, + "loss": 0.85229158, + "num_input_tokens_seen": 83377345, + "step": 3877, + "time_per_iteration": 2.5547399520874023 + }, + { + "auxiliary_loss_clip": 0.01134007, + "auxiliary_loss_mlp": 0.01029665, + "balance_loss_clip": 1.0530827, + "balance_loss_mlp": 1.02146959, + "epoch": 0.46630192989839475, + "flos": 26176975810560.0, + "grad_norm": 1.6979508638888605, + "language_loss": 0.82295507, + "learning_rate": 2.3141768094656393e-06, + "loss": 0.84459186, + "num_input_tokens_seen": 83395920, + "step": 3878, + "time_per_iteration": 2.5586094856262207 + }, + { + "auxiliary_loss_clip": 0.01105843, + "auxiliary_loss_mlp": 0.0102888, + "balance_loss_clip": 1.04454708, + "balance_loss_mlp": 1.02097595, + "epoch": 0.46642217278903386, + "flos": 11509622150400.0, + "grad_norm": 2.3558260735724907, + "language_loss": 0.82952344, + "learning_rate": 2.3134074852951966e-06, + "loss": 0.85087067, + "num_input_tokens_seen": 83412510, + "step": 3879, + "time_per_iteration": 2.580610513687134 + }, + { + "auxiliary_loss_clip": 0.01119884, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.04363728, + "balance_loss_mlp": 1.02316689, + "epoch": 0.4665424156796729, + "flos": 32306819299200.0, + "grad_norm": 1.6116375456185172, + "language_loss": 0.77827567, + "learning_rate": 2.312638113581088e-06, + "loss": 0.79978389, + "num_input_tokens_seen": 83432995, + "step": 3880, + "time_per_iteration": 2.635580062866211 + }, + { + "auxiliary_loss_clip": 0.01168217, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.05089295, + "balance_loss_mlp": 1.02157664, + "epoch": 0.46666265857031203, + "flos": 18436179254400.0, + "grad_norm": 2.573096329465983, + "language_loss": 0.77883887, + "learning_rate": 2.311868694440027e-06, + "loss": 0.80081832, + "num_input_tokens_seen": 83447415, + "step": 3881, + "time_per_iteration": 2.4474406242370605 + }, + { + "auxiliary_loss_clip": 0.01086467, + "auxiliary_loss_mlp": 0.01004618, + "balance_loss_clip": 1.02231169, + "balance_loss_mlp": 1.00350344, + "epoch": 0.46678290146095114, + "flos": 68438989221120.0, + "grad_norm": 0.732041733792607, + "language_loss": 0.62523288, + "learning_rate": 2.3110992279887323e-06, + "loss": 0.64614373, + "num_input_tokens_seen": 83519340, + "step": 3882, + "time_per_iteration": 3.1565401554107666 + }, + { + "auxiliary_loss_clip": 0.01147862, + "auxiliary_loss_mlp": 0.01028464, + "balance_loss_clip": 1.05331504, + "balance_loss_mlp": 1.02008343, + "epoch": 0.4669031443515902, + "flos": 17712507945600.0, + "grad_norm": 2.2361374047629696, + "language_loss": 0.84642488, + "learning_rate": 2.310329714343932e-06, + "loss": 0.86818814, + "num_input_tokens_seen": 83535490, + "step": 3883, + "time_per_iteration": 3.3573288917541504 + }, + { + "auxiliary_loss_clip": 0.01150141, + "auxiliary_loss_mlp": 0.01023739, + "balance_loss_clip": 1.05067158, + "balance_loss_mlp": 1.01568651, + "epoch": 0.4670233872422293, + "flos": 23947748916480.0, + "grad_norm": 2.0114157679523506, + "language_loss": 0.8227917, + "learning_rate": 2.309560153622361e-06, + "loss": 0.84453058, + "num_input_tokens_seen": 83552400, + "step": 3884, + "time_per_iteration": 2.5054590702056885 + }, + { + "auxiliary_loss_clip": 0.01140206, + "auxiliary_loss_mlp": 0.01029606, + "balance_loss_clip": 1.05041337, + "balance_loss_mlp": 1.02079654, + "epoch": 0.4671436301328684, + "flos": 28111268131200.0, + "grad_norm": 1.907342007010759, + "language_loss": 0.74632037, + "learning_rate": 2.3087905459407602e-06, + "loss": 0.76801848, + "num_input_tokens_seen": 83571340, + "step": 3885, + "time_per_iteration": 2.583822250366211 + }, + { + "auxiliary_loss_clip": 0.01074645, + "auxiliary_loss_mlp": 0.010012, + "balance_loss_clip": 1.01950741, + "balance_loss_mlp": 1.00006795, + "epoch": 0.46726387302350747, + "flos": 69369684566400.0, + "grad_norm": 0.7899103192596285, + "language_loss": 0.62985563, + "learning_rate": 2.3080208914158795e-06, + "loss": 0.65061414, + "num_input_tokens_seen": 83634340, + "step": 3886, + "time_per_iteration": 3.9647345542907715 + }, + { + "auxiliary_loss_clip": 0.01153786, + "auxiliary_loss_mlp": 0.01026643, + "balance_loss_clip": 1.05349183, + "balance_loss_mlp": 1.01876879, + "epoch": 0.4673841159141466, + "flos": 25519666878720.0, + "grad_norm": 2.180772374069027, + "language_loss": 0.72144479, + "learning_rate": 2.3072511901644753e-06, + "loss": 0.74324906, + "num_input_tokens_seen": 83653410, + "step": 3887, + "time_per_iteration": 2.528275489807129 + }, + { + "auxiliary_loss_clip": 0.01183669, + "auxiliary_loss_mlp": 0.01024972, + "balance_loss_clip": 1.05625856, + "balance_loss_mlp": 1.01689017, + "epoch": 0.4675043588047857, + "flos": 24499265316480.0, + "grad_norm": 1.7702952062515918, + "language_loss": 0.80995989, + "learning_rate": 2.306481442303309e-06, + "loss": 0.83204627, + "num_input_tokens_seen": 83672985, + "step": 3888, + "time_per_iteration": 3.247695207595825 + }, + { + "auxiliary_loss_clip": 0.01170851, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.05205393, + "balance_loss_mlp": 1.01982522, + "epoch": 0.46762460169542475, + "flos": 20960771685120.0, + "grad_norm": 1.885880303298299, + "language_loss": 0.73227996, + "learning_rate": 2.3057116479491515e-06, + "loss": 0.75427294, + "num_input_tokens_seen": 83692395, + "step": 3889, + "time_per_iteration": 2.4668185710906982 + }, + { + "auxiliary_loss_clip": 0.01163405, + "auxiliary_loss_mlp": 0.01029331, + "balance_loss_clip": 1.04871678, + "balance_loss_mlp": 1.0213145, + "epoch": 0.46774484458606386, + "flos": 19171666137600.0, + "grad_norm": 1.8307758839218344, + "language_loss": 0.76158923, + "learning_rate": 2.30494180721878e-06, + "loss": 0.78351659, + "num_input_tokens_seen": 83709735, + "step": 3890, + "time_per_iteration": 2.437392234802246 + }, + { + "auxiliary_loss_clip": 0.01166558, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.0514648, + "balance_loss_mlp": 1.02476072, + "epoch": 0.4678650874767029, + "flos": 17967689141760.0, + "grad_norm": 1.984678040888482, + "language_loss": 0.89754528, + "learning_rate": 2.3041719202289794e-06, + "loss": 0.91953576, + "num_input_tokens_seen": 83725910, + "step": 3891, + "time_per_iteration": 3.1704788208007812 + }, + { + "auxiliary_loss_clip": 0.01168691, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.05288756, + "balance_loss_mlp": 1.02413988, + "epoch": 0.467985330367342, + "flos": 21360816432000.0, + "grad_norm": 1.664477723877608, + "language_loss": 0.80685574, + "learning_rate": 2.30340198709654e-06, + "loss": 0.82886505, + "num_input_tokens_seen": 83745745, + "step": 3892, + "time_per_iteration": 2.475205659866333 + }, + { + "auxiliary_loss_clip": 0.01157089, + "auxiliary_loss_mlp": 0.01030517, + "balance_loss_clip": 1.04808879, + "balance_loss_mlp": 1.02253628, + "epoch": 0.46810557325798113, + "flos": 20521835487360.0, + "grad_norm": 2.178028118512589, + "language_loss": 0.74731648, + "learning_rate": 2.3026320079382605e-06, + "loss": 0.76919258, + "num_input_tokens_seen": 83762680, + "step": 3893, + "time_per_iteration": 2.478456974029541 + }, + { + "auxiliary_loss_clip": 0.01181422, + "auxiliary_loss_mlp": 0.01024631, + "balance_loss_clip": 1.05420351, + "balance_loss_mlp": 1.01641786, + "epoch": 0.4682258161486202, + "flos": 30117848572800.0, + "grad_norm": 1.9910731750170454, + "language_loss": 0.76259762, + "learning_rate": 2.3018619828709454e-06, + "loss": 0.78465819, + "num_input_tokens_seen": 83784220, + "step": 3894, + "time_per_iteration": 2.4849154949188232 + }, + { + "auxiliary_loss_clip": 0.01165536, + "auxiliary_loss_mlp": 0.00762786, + "balance_loss_clip": 1.05523419, + "balance_loss_mlp": 1.00016975, + "epoch": 0.4683460590392593, + "flos": 25293357239040.0, + "grad_norm": 1.941504932459736, + "language_loss": 0.8214823, + "learning_rate": 2.3010919120114084e-06, + "loss": 0.84076548, + "num_input_tokens_seen": 83800750, + "step": 3895, + "time_per_iteration": 2.471978187561035 + }, + { + "auxiliary_loss_clip": 0.01162205, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.04592657, + "balance_loss_mlp": 1.02428472, + "epoch": 0.4684663019298984, + "flos": 15368330551680.0, + "grad_norm": 2.936894641281481, + "language_loss": 0.66227627, + "learning_rate": 2.3003217954764672e-06, + "loss": 0.6842196, + "num_input_tokens_seen": 83815455, + "step": 3896, + "time_per_iteration": 2.399125814437866 + }, + { + "auxiliary_loss_clip": 0.01168341, + "auxiliary_loss_mlp": 0.01026117, + "balance_loss_clip": 1.04838705, + "balance_loss_mlp": 1.01792777, + "epoch": 0.46858654482053747, + "flos": 27778842737280.0, + "grad_norm": 2.183958799472053, + "language_loss": 0.7964617, + "learning_rate": 2.299551633382949e-06, + "loss": 0.81840628, + "num_input_tokens_seen": 83835765, + "step": 3897, + "time_per_iteration": 2.498365879058838 + }, + { + "auxiliary_loss_clip": 0.01146293, + "auxiliary_loss_mlp": 0.01029172, + "balance_loss_clip": 1.04847121, + "balance_loss_mlp": 1.02112532, + "epoch": 0.4687067877111766, + "flos": 18040623707520.0, + "grad_norm": 1.8873209137383269, + "language_loss": 0.85583794, + "learning_rate": 2.2987814258476854e-06, + "loss": 0.87759262, + "num_input_tokens_seen": 83853565, + "step": 3898, + "time_per_iteration": 2.459045886993408 + }, + { + "auxiliary_loss_clip": 0.01126318, + "auxiliary_loss_mlp": 0.01025096, + "balance_loss_clip": 1.04328096, + "balance_loss_mlp": 1.01698685, + "epoch": 0.4688270306018157, + "flos": 16977380198400.0, + "grad_norm": 2.3232820564551893, + "language_loss": 0.67809123, + "learning_rate": 2.2980111729875177e-06, + "loss": 0.69960535, + "num_input_tokens_seen": 83869815, + "step": 3899, + "time_per_iteration": 2.5148820877075195 + }, + { + "auxiliary_loss_clip": 0.01149602, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.05162537, + "balance_loss_mlp": 1.02554631, + "epoch": 0.46894727349245474, + "flos": 17821640442240.0, + "grad_norm": 1.7038890059976342, + "language_loss": 0.82639122, + "learning_rate": 2.2972408749192917e-06, + "loss": 0.84822327, + "num_input_tokens_seen": 83887545, + "step": 3900, + "time_per_iteration": 2.454667568206787 + }, + { + "auxiliary_loss_clip": 0.01167148, + "auxiliary_loss_mlp": 0.00762295, + "balance_loss_clip": 1.05345964, + "balance_loss_mlp": 1.00015974, + "epoch": 0.46906751638309385, + "flos": 21471349559040.0, + "grad_norm": 1.9022616262999756, + "language_loss": 0.66984117, + "learning_rate": 2.296470531759861e-06, + "loss": 0.68913567, + "num_input_tokens_seen": 83905645, + "step": 3901, + "time_per_iteration": 2.5472047328948975 + }, + { + "auxiliary_loss_clip": 0.01134553, + "auxiliary_loss_mlp": 0.01027173, + "balance_loss_clip": 1.04632318, + "balance_loss_mlp": 1.01845884, + "epoch": 0.46918775927373296, + "flos": 20337829090560.0, + "grad_norm": 2.104826364359253, + "language_loss": 0.79586041, + "learning_rate": 2.2957001436260866e-06, + "loss": 0.81747758, + "num_input_tokens_seen": 83922705, + "step": 3902, + "time_per_iteration": 2.50502610206604 + }, + { + "auxiliary_loss_clip": 0.01149945, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.04967451, + "balance_loss_mlp": 1.02277637, + "epoch": 0.469308002164372, + "flos": 18403249461120.0, + "grad_norm": 1.6900208873205684, + "language_loss": 0.73074389, + "learning_rate": 2.294929710634836e-06, + "loss": 0.75255048, + "num_input_tokens_seen": 83940795, + "step": 3903, + "time_per_iteration": 2.4626681804656982 + }, + { + "auxiliary_loss_clip": 0.01163985, + "auxiliary_loss_mlp": 0.01032766, + "balance_loss_clip": 1.04815865, + "balance_loss_mlp": 1.02452862, + "epoch": 0.46942824505501113, + "flos": 37962067363200.0, + "grad_norm": 1.7988188607603042, + "language_loss": 0.6076479, + "learning_rate": 2.2941592329029823e-06, + "loss": 0.62961543, + "num_input_tokens_seen": 83961900, + "step": 3904, + "time_per_iteration": 2.6006581783294678 + }, + { + "auxiliary_loss_clip": 0.01164936, + "auxiliary_loss_mlp": 0.01031881, + "balance_loss_clip": 1.05158997, + "balance_loss_mlp": 1.02277911, + "epoch": 0.46954848794565024, + "flos": 21872507627520.0, + "grad_norm": 2.013836162482629, + "language_loss": 0.79043615, + "learning_rate": 2.2933887105474067e-06, + "loss": 0.81240427, + "num_input_tokens_seen": 83980075, + "step": 3905, + "time_per_iteration": 2.468151807785034 + }, + { + "auxiliary_loss_clip": 0.01165186, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.05300975, + "balance_loss_mlp": 1.02302909, + "epoch": 0.4696687308362893, + "flos": 22016545165440.0, + "grad_norm": 1.7684303629614024, + "language_loss": 0.8133927, + "learning_rate": 2.2926181436849974e-06, + "loss": 0.83535147, + "num_input_tokens_seen": 83999430, + "step": 3906, + "time_per_iteration": 2.4747769832611084 + }, + { + "auxiliary_loss_clip": 0.01166751, + "auxiliary_loss_mlp": 0.01033423, + "balance_loss_clip": 1.05305803, + "balance_loss_mlp": 1.02475095, + "epoch": 0.4697889737269284, + "flos": 21613663244160.0, + "grad_norm": 1.7270848221075503, + "language_loss": 0.72615135, + "learning_rate": 2.2918475324326478e-06, + "loss": 0.74815309, + "num_input_tokens_seen": 84019150, + "step": 3907, + "time_per_iteration": 2.4720046520233154 + }, + { + "auxiliary_loss_clip": 0.01172444, + "auxiliary_loss_mlp": 0.00762847, + "balance_loss_clip": 1.05482447, + "balance_loss_mlp": 1.00018561, + "epoch": 0.46990921661756746, + "flos": 25228323665280.0, + "grad_norm": 2.414298547109381, + "language_loss": 0.9137364, + "learning_rate": 2.2910768769072603e-06, + "loss": 0.93308932, + "num_input_tokens_seen": 84037930, + "step": 3908, + "time_per_iteration": 2.494840145111084 + }, + { + "auxiliary_loss_clip": 0.011628, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.05139899, + "balance_loss_mlp": 1.02514374, + "epoch": 0.47002945950820657, + "flos": 13844031045120.0, + "grad_norm": 1.8640808578300783, + "language_loss": 0.75583982, + "learning_rate": 2.2903061772257417e-06, + "loss": 0.77779669, + "num_input_tokens_seen": 84055915, + "step": 3909, + "time_per_iteration": 2.4327032566070557 + }, + { + "auxiliary_loss_clip": 0.01166474, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.05212092, + "balance_loss_mlp": 1.02214789, + "epoch": 0.4701497023988457, + "flos": 26247001374720.0, + "grad_norm": 1.4465889587670249, + "language_loss": 0.78676677, + "learning_rate": 2.289535433505007e-06, + "loss": 0.80873364, + "num_input_tokens_seen": 84077270, + "step": 3910, + "time_per_iteration": 3.34236478805542 + }, + { + "auxiliary_loss_clip": 0.01154239, + "auxiliary_loss_mlp": 0.01027731, + "balance_loss_clip": 1.04852319, + "balance_loss_mlp": 1.01959467, + "epoch": 0.47026994528948474, + "flos": 25629517647360.0, + "grad_norm": 3.3325974196098884, + "language_loss": 0.63712442, + "learning_rate": 2.2887646458619767e-06, + "loss": 0.65894413, + "num_input_tokens_seen": 84098635, + "step": 3911, + "time_per_iteration": 2.5121309757232666 + }, + { + "auxiliary_loss_clip": 0.0114579, + "auxiliary_loss_mlp": 0.01033416, + "balance_loss_clip": 1.04975271, + "balance_loss_mlp": 1.02493477, + "epoch": 0.47039018818012385, + "flos": 20554406144640.0, + "grad_norm": 1.8168180917927572, + "language_loss": 0.76332533, + "learning_rate": 2.2879938144135797e-06, + "loss": 0.78511739, + "num_input_tokens_seen": 84114740, + "step": 3912, + "time_per_iteration": 3.391508102416992 + }, + { + "auxiliary_loss_clip": 0.01137282, + "auxiliary_loss_mlp": 0.00762165, + "balance_loss_clip": 1.04772663, + "balance_loss_mlp": 1.0001483, + "epoch": 0.47051043107076296, + "flos": 21577249831680.0, + "grad_norm": 1.566237535722384, + "language_loss": 0.74709392, + "learning_rate": 2.2872229392767496e-06, + "loss": 0.76608837, + "num_input_tokens_seen": 84134845, + "step": 3913, + "time_per_iteration": 2.554697036743164 + }, + { + "auxiliary_loss_clip": 0.01173013, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.05492735, + "balance_loss_mlp": 1.02301705, + "epoch": 0.470630673961402, + "flos": 18953185662720.0, + "grad_norm": 1.6615889273849738, + "language_loss": 0.74864447, + "learning_rate": 2.286452020568428e-06, + "loss": 0.770679, + "num_input_tokens_seen": 84152920, + "step": 3914, + "time_per_iteration": 2.4670512676239014 + }, + { + "auxiliary_loss_clip": 0.01187811, + "auxiliary_loss_mlp": 0.01029877, + "balance_loss_clip": 1.05464709, + "balance_loss_mlp": 1.02084088, + "epoch": 0.4707509168520411, + "flos": 19938969492480.0, + "grad_norm": 1.7767280007631059, + "language_loss": 0.73110145, + "learning_rate": 2.2856810584055637e-06, + "loss": 0.75327832, + "num_input_tokens_seen": 84170455, + "step": 3915, + "time_per_iteration": 3.1501214504241943 + }, + { + "auxiliary_loss_clip": 0.01169901, + "auxiliary_loss_mlp": 0.01026995, + "balance_loss_clip": 1.05301285, + "balance_loss_mlp": 1.01920438, + "epoch": 0.47087115974268023, + "flos": 40118754741120.0, + "grad_norm": 1.4843282628754895, + "language_loss": 0.67796385, + "learning_rate": 2.2849100529051085e-06, + "loss": 0.69993281, + "num_input_tokens_seen": 84197390, + "step": 3916, + "time_per_iteration": 2.6483914852142334 + }, + { + "auxiliary_loss_clip": 0.01181751, + "auxiliary_loss_mlp": 0.01028849, + "balance_loss_clip": 1.05486059, + "balance_loss_mlp": 1.0206002, + "epoch": 0.4709914026333193, + "flos": 13552723745280.0, + "grad_norm": 2.6199059526797646, + "language_loss": 0.80272996, + "learning_rate": 2.284139004184026e-06, + "loss": 0.8248359, + "num_input_tokens_seen": 84214620, + "step": 3917, + "time_per_iteration": 3.078840732574463 + }, + { + "auxiliary_loss_clip": 0.01184876, + "auxiliary_loss_mlp": 0.01025919, + "balance_loss_clip": 1.05471694, + "balance_loss_mlp": 1.01772952, + "epoch": 0.4711116455239584, + "flos": 19974628719360.0, + "grad_norm": 1.9085208601447747, + "language_loss": 0.74340433, + "learning_rate": 2.2833679123592814e-06, + "loss": 0.76551229, + "num_input_tokens_seen": 84231880, + "step": 3918, + "time_per_iteration": 2.4177355766296387 + }, + { + "auxiliary_loss_clip": 0.01152112, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.05053329, + "balance_loss_mlp": 1.02140808, + "epoch": 0.4712318884145975, + "flos": 32124824064000.0, + "grad_norm": 1.8913302106108993, + "language_loss": 0.63578331, + "learning_rate": 2.2825967775478508e-06, + "loss": 0.65760362, + "num_input_tokens_seen": 84252980, + "step": 3919, + "time_per_iteration": 2.5593221187591553 + }, + { + "auxiliary_loss_clip": 0.01181308, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.05168343, + "balance_loss_mlp": 1.0218457, + "epoch": 0.47135213130523657, + "flos": 20047850593920.0, + "grad_norm": 1.9486405646148768, + "language_loss": 0.83488429, + "learning_rate": 2.2818255998667135e-06, + "loss": 0.85699725, + "num_input_tokens_seen": 84271490, + "step": 3920, + "time_per_iteration": 2.407737970352173 + }, + { + "auxiliary_loss_clip": 0.01168589, + "auxiliary_loss_mlp": 0.01023757, + "balance_loss_clip": 1.05431485, + "balance_loss_mlp": 1.01621699, + "epoch": 0.4714723741958757, + "flos": 19426990988160.0, + "grad_norm": 1.67337167104691, + "language_loss": 0.79162836, + "learning_rate": 2.2810543794328566e-06, + "loss": 0.81355184, + "num_input_tokens_seen": 84290525, + "step": 3921, + "time_per_iteration": 2.436403751373291 + }, + { + "auxiliary_loss_clip": 0.01171074, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.0525527, + "balance_loss_mlp": 1.02193356, + "epoch": 0.4715926170865148, + "flos": 20373883367040.0, + "grad_norm": 1.6731921078568863, + "language_loss": 0.82470763, + "learning_rate": 2.2802831163632735e-06, + "loss": 0.84671533, + "num_input_tokens_seen": 84309245, + "step": 3922, + "time_per_iteration": 2.5063672065734863 + }, + { + "auxiliary_loss_clip": 0.01114736, + "auxiliary_loss_mlp": 0.01029042, + "balance_loss_clip": 1.04751801, + "balance_loss_mlp": 1.0205543, + "epoch": 0.47171285997715384, + "flos": 22672884430080.0, + "grad_norm": 1.656064427367903, + "language_loss": 0.74663508, + "learning_rate": 2.279511810774965e-06, + "loss": 0.76807296, + "num_input_tokens_seen": 84330775, + "step": 3923, + "time_per_iteration": 2.5902345180511475 + }, + { + "auxiliary_loss_clip": 0.01184181, + "auxiliary_loss_mlp": 0.01027104, + "balance_loss_clip": 1.05463362, + "balance_loss_mlp": 1.01911116, + "epoch": 0.47183310286779295, + "flos": 21105419754240.0, + "grad_norm": 1.9027523324417548, + "language_loss": 0.71974021, + "learning_rate": 2.2787404627849364e-06, + "loss": 0.74185306, + "num_input_tokens_seen": 84349985, + "step": 3924, + "time_per_iteration": 2.416029691696167 + }, + { + "auxiliary_loss_clip": 0.01151481, + "auxiliary_loss_mlp": 0.01027267, + "balance_loss_clip": 1.04859531, + "balance_loss_mlp": 1.01920867, + "epoch": 0.471953345758432, + "flos": 21726566668800.0, + "grad_norm": 1.9920244565664502, + "language_loss": 0.79076815, + "learning_rate": 2.277969072510202e-06, + "loss": 0.81255561, + "num_input_tokens_seen": 84368965, + "step": 3925, + "time_per_iteration": 2.4915759563446045 + }, + { + "auxiliary_loss_clip": 0.01154154, + "auxiliary_loss_mlp": 0.01026082, + "balance_loss_clip": 1.05168402, + "balance_loss_mlp": 1.01827955, + "epoch": 0.4720735886490711, + "flos": 19861078849920.0, + "grad_norm": 1.6432953144551201, + "language_loss": 0.81646621, + "learning_rate": 2.2771976400677803e-06, + "loss": 0.83826864, + "num_input_tokens_seen": 84387795, + "step": 3926, + "time_per_iteration": 2.491424083709717 + }, + { + "auxiliary_loss_clip": 0.01115938, + "auxiliary_loss_mlp": 0.01024031, + "balance_loss_clip": 1.04512691, + "balance_loss_mlp": 1.0161097, + "epoch": 0.47219383153971023, + "flos": 19171809792000.0, + "grad_norm": 1.757102901117818, + "language_loss": 0.78484219, + "learning_rate": 2.2764261655746965e-06, + "loss": 0.80624187, + "num_input_tokens_seen": 84405290, + "step": 3927, + "time_per_iteration": 2.5622406005859375 + }, + { + "auxiliary_loss_clip": 0.01137155, + "auxiliary_loss_mlp": 0.01023608, + "balance_loss_clip": 1.04924273, + "balance_loss_mlp": 1.01570392, + "epoch": 0.4723140744303493, + "flos": 23224005780480.0, + "grad_norm": 1.6525497716001576, + "language_loss": 0.75936806, + "learning_rate": 2.2756546491479832e-06, + "loss": 0.78097564, + "num_input_tokens_seen": 84426205, + "step": 3928, + "time_per_iteration": 2.594552993774414 + }, + { + "auxiliary_loss_clip": 0.01183145, + "auxiliary_loss_mlp": 0.00762574, + "balance_loss_clip": 1.05210543, + "balance_loss_mlp": 1.00017548, + "epoch": 0.4724343173209884, + "flos": 18223265387520.0, + "grad_norm": 4.575438564586592, + "language_loss": 0.80469501, + "learning_rate": 2.274883090904679e-06, + "loss": 0.82415217, + "num_input_tokens_seen": 84443970, + "step": 3929, + "time_per_iteration": 2.482083320617676 + }, + { + "auxiliary_loss_clip": 0.01185622, + "auxiliary_loss_mlp": 0.01028069, + "balance_loss_clip": 1.05644011, + "balance_loss_mlp": 1.02007627, + "epoch": 0.4725545602116275, + "flos": 21251037490560.0, + "grad_norm": 2.3492833442457774, + "language_loss": 0.68133175, + "learning_rate": 2.2741114909618283e-06, + "loss": 0.70346868, + "num_input_tokens_seen": 84459865, + "step": 3930, + "time_per_iteration": 2.428098678588867 + }, + { + "auxiliary_loss_clip": 0.01141188, + "auxiliary_loss_mlp": 0.01024984, + "balance_loss_clip": 1.04971039, + "balance_loss_mlp": 1.01655602, + "epoch": 0.47267480310226656, + "flos": 21434002392960.0, + "grad_norm": 1.9010653593579911, + "language_loss": 0.71740603, + "learning_rate": 2.2733398494364828e-06, + "loss": 0.73906779, + "num_input_tokens_seen": 84479110, + "step": 3931, + "time_per_iteration": 2.6437652111053467 + }, + { + "auxiliary_loss_clip": 0.01151119, + "auxiliary_loss_mlp": 0.01027011, + "balance_loss_clip": 1.05338371, + "balance_loss_mlp": 1.01914883, + "epoch": 0.47279504599290567, + "flos": 18770508069120.0, + "grad_norm": 1.8301577529125357, + "language_loss": 0.84467971, + "learning_rate": 2.272568166445699e-06, + "loss": 0.86646104, + "num_input_tokens_seen": 84497675, + "step": 3932, + "time_per_iteration": 2.4781692028045654 + }, + { + "auxiliary_loss_clip": 0.01167848, + "auxiliary_loss_mlp": 0.01021889, + "balance_loss_clip": 1.05021477, + "balance_loss_mlp": 1.01361609, + "epoch": 0.4729152888835448, + "flos": 21105742976640.0, + "grad_norm": 1.9076541834854692, + "language_loss": 0.64131486, + "learning_rate": 2.271796442106541e-06, + "loss": 0.6632123, + "num_input_tokens_seen": 84517030, + "step": 3933, + "time_per_iteration": 2.4738810062408447 + }, + { + "auxiliary_loss_clip": 0.01048427, + "auxiliary_loss_mlp": 0.01004574, + "balance_loss_clip": 1.01725721, + "balance_loss_mlp": 1.00341153, + "epoch": 0.47303553177418384, + "flos": 70201877840640.0, + "grad_norm": 0.805979241650793, + "language_loss": 0.56495887, + "learning_rate": 2.271024676536079e-06, + "loss": 0.58548892, + "num_input_tokens_seen": 84577290, + "step": 3934, + "time_per_iteration": 3.096956491470337 + }, + { + "auxiliary_loss_clip": 0.01161425, + "auxiliary_loss_mlp": 0.01029103, + "balance_loss_clip": 1.05564463, + "balance_loss_mlp": 1.02038836, + "epoch": 0.47315577466482295, + "flos": 22455122227200.0, + "grad_norm": 2.1328800265111583, + "language_loss": 0.73396826, + "learning_rate": 2.2702528698513894e-06, + "loss": 0.7558735, + "num_input_tokens_seen": 84598415, + "step": 3935, + "time_per_iteration": 2.5184085369110107 + }, + { + "auxiliary_loss_clip": 0.01153546, + "auxiliary_loss_mlp": 0.01028118, + "balance_loss_clip": 1.0472182, + "balance_loss_mlp": 1.0199579, + "epoch": 0.47327601755546206, + "flos": 24352857480960.0, + "grad_norm": 2.103317361248548, + "language_loss": 0.78858531, + "learning_rate": 2.269481022169554e-06, + "loss": 0.81040192, + "num_input_tokens_seen": 84617010, + "step": 3936, + "time_per_iteration": 2.5378787517547607 + }, + { + "auxiliary_loss_clip": 0.01159156, + "auxiliary_loss_mlp": 0.01026561, + "balance_loss_clip": 1.04911327, + "balance_loss_mlp": 1.01791883, + "epoch": 0.4733962604461011, + "flos": 22926772736640.0, + "grad_norm": 1.537479449737051, + "language_loss": 0.80446088, + "learning_rate": 2.2687091336076614e-06, + "loss": 0.82631803, + "num_input_tokens_seen": 84636350, + "step": 3937, + "time_per_iteration": 3.367605686187744 + }, + { + "auxiliary_loss_clip": 0.01167021, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.05243397, + "balance_loss_mlp": 1.0260694, + "epoch": 0.4735165033367402, + "flos": 18327369980160.0, + "grad_norm": 1.7593878940941947, + "language_loss": 0.79858375, + "learning_rate": 2.267937204282807e-06, + "loss": 0.82059515, + "num_input_tokens_seen": 84653490, + "step": 3938, + "time_per_iteration": 2.447352886199951 + }, + { + "auxiliary_loss_clip": 0.01175472, + "auxiliary_loss_mlp": 0.01026577, + "balance_loss_clip": 1.05418169, + "balance_loss_mlp": 1.01808929, + "epoch": 0.4736367462273793, + "flos": 23037018554880.0, + "grad_norm": 1.8830531469526242, + "language_loss": 0.79087877, + "learning_rate": 2.2671652343120926e-06, + "loss": 0.81289923, + "num_input_tokens_seen": 84673965, + "step": 3939, + "time_per_iteration": 3.3205392360687256 + }, + { + "auxiliary_loss_clip": 0.01182226, + "auxiliary_loss_mlp": 0.01026962, + "balance_loss_clip": 1.05430079, + "balance_loss_mlp": 1.01934767, + "epoch": 0.4737569891180184, + "flos": 25374336451200.0, + "grad_norm": 1.695863902800287, + "language_loss": 0.80493605, + "learning_rate": 2.2663932238126236e-06, + "loss": 0.82702792, + "num_input_tokens_seen": 84692525, + "step": 3940, + "time_per_iteration": 2.4844329357147217 + }, + { + "auxiliary_loss_clip": 0.01168258, + "auxiliary_loss_mlp": 0.01024126, + "balance_loss_clip": 1.05049801, + "balance_loss_mlp": 1.01565027, + "epoch": 0.4738772320086575, + "flos": 25849326925440.0, + "grad_norm": 1.4025689413624673, + "language_loss": 0.80538106, + "learning_rate": 2.265621172901515e-06, + "loss": 0.82730484, + "num_input_tokens_seen": 84715640, + "step": 3941, + "time_per_iteration": 3.304386615753174 + }, + { + "auxiliary_loss_clip": 0.01186283, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.05710828, + "balance_loss_mlp": 1.02434611, + "epoch": 0.47399747489929656, + "flos": 27564420499200.0, + "grad_norm": 2.213206429423058, + "language_loss": 0.71631354, + "learning_rate": 2.2648490816958854e-06, + "loss": 0.73850167, + "num_input_tokens_seen": 84736635, + "step": 3942, + "time_per_iteration": 2.50063157081604 + }, + { + "auxiliary_loss_clip": 0.01168264, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.05015123, + "balance_loss_mlp": 1.02203131, + "epoch": 0.47411771778993567, + "flos": 24863650836480.0, + "grad_norm": 2.296572266514965, + "language_loss": 0.73123455, + "learning_rate": 2.264076950312861e-06, + "loss": 0.75322914, + "num_input_tokens_seen": 84755445, + "step": 3943, + "time_per_iteration": 2.4793343544006348 + }, + { + "auxiliary_loss_clip": 0.01159549, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.05114079, + "balance_loss_mlp": 1.02481544, + "epoch": 0.4742379606805748, + "flos": 22748009725440.0, + "grad_norm": 1.9875040261026784, + "language_loss": 0.82636118, + "learning_rate": 2.2633047788695727e-06, + "loss": 0.84829378, + "num_input_tokens_seen": 84775750, + "step": 3944, + "time_per_iteration": 3.277961254119873 + }, + { + "auxiliary_loss_clip": 0.01153429, + "auxiliary_loss_mlp": 0.01028802, + "balance_loss_clip": 1.05159068, + "balance_loss_mlp": 1.02140808, + "epoch": 0.47435820357121383, + "flos": 19681130689920.0, + "grad_norm": 1.8106678933938967, + "language_loss": 0.64074922, + "learning_rate": 2.262532567483159e-06, + "loss": 0.66257155, + "num_input_tokens_seen": 84794310, + "step": 3945, + "time_per_iteration": 2.4848594665527344 + }, + { + "auxiliary_loss_clip": 0.01187705, + "auxiliary_loss_mlp": 0.00762947, + "balance_loss_clip": 1.05689502, + "balance_loss_mlp": 1.00022018, + "epoch": 0.47447844646185294, + "flos": 25228718714880.0, + "grad_norm": 1.953219185358619, + "language_loss": 0.8039096, + "learning_rate": 2.2617603162707635e-06, + "loss": 0.82341611, + "num_input_tokens_seen": 84814720, + "step": 3946, + "time_per_iteration": 2.459381341934204 + }, + { + "auxiliary_loss_clip": 0.01182574, + "auxiliary_loss_mlp": 0.0102756, + "balance_loss_clip": 1.05416536, + "balance_loss_mlp": 1.01962066, + "epoch": 0.47459868935249205, + "flos": 24570619683840.0, + "grad_norm": 1.7823515575382127, + "language_loss": 0.82610595, + "learning_rate": 2.2609880253495363e-06, + "loss": 0.84820735, + "num_input_tokens_seen": 84834355, + "step": 3947, + "time_per_iteration": 2.469965696334839 + }, + { + "auxiliary_loss_clip": 0.01149464, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.04830933, + "balance_loss_mlp": 1.02450728, + "epoch": 0.4747189322431311, + "flos": 20558500295040.0, + "grad_norm": 1.9899514402168406, + "language_loss": 0.86516255, + "learning_rate": 2.260215694836633e-06, + "loss": 0.88698292, + "num_input_tokens_seen": 84853530, + "step": 3948, + "time_per_iteration": 2.5163052082061768 + }, + { + "auxiliary_loss_clip": 0.01127495, + "auxiliary_loss_mlp": 0.00762548, + "balance_loss_clip": 1.04538691, + "balance_loss_mlp": 1.00017416, + "epoch": 0.4748391751337702, + "flos": 25995231970560.0, + "grad_norm": 2.1232131002539476, + "language_loss": 0.65029842, + "learning_rate": 2.2594433248492157e-06, + "loss": 0.66919887, + "num_input_tokens_seen": 84872505, + "step": 3949, + "time_per_iteration": 2.6297905445098877 + }, + { + "auxiliary_loss_clip": 0.01172745, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.05143785, + "balance_loss_mlp": 1.02367806, + "epoch": 0.47495941802440933, + "flos": 22821052032000.0, + "grad_norm": 1.7018345763394311, + "language_loss": 0.80039042, + "learning_rate": 2.2586709155044527e-06, + "loss": 0.82243013, + "num_input_tokens_seen": 84893105, + "step": 3950, + "time_per_iteration": 2.485506057739258 + }, + { + "auxiliary_loss_clip": 0.01184137, + "auxiliary_loss_mlp": 0.0102735, + "balance_loss_clip": 1.05480194, + "balance_loss_mlp": 1.01877904, + "epoch": 0.4750796609150484, + "flos": 27891782075520.0, + "grad_norm": 1.535165517223909, + "language_loss": 0.75993085, + "learning_rate": 2.2578984669195167e-06, + "loss": 0.78204578, + "num_input_tokens_seen": 84914070, + "step": 3951, + "time_per_iteration": 2.4928951263427734 + }, + { + "auxiliary_loss_clip": 0.01164282, + "auxiliary_loss_mlp": 0.01027623, + "balance_loss_clip": 1.04787517, + "balance_loss_mlp": 1.01997006, + "epoch": 0.4751999038056875, + "flos": 35660085471360.0, + "grad_norm": 1.7078249981280178, + "language_loss": 0.68011773, + "learning_rate": 2.2571259792115887e-06, + "loss": 0.7020368, + "num_input_tokens_seen": 84935290, + "step": 3952, + "time_per_iteration": 2.6143741607666016 + }, + { + "auxiliary_loss_clip": 0.01162092, + "auxiliary_loss_mlp": 0.01027994, + "balance_loss_clip": 1.04994595, + "balance_loss_mlp": 1.02075469, + "epoch": 0.4753201466963266, + "flos": 22090880361600.0, + "grad_norm": 2.010348380700964, + "language_loss": 0.79343057, + "learning_rate": 2.2563534524978544e-06, + "loss": 0.81533134, + "num_input_tokens_seen": 84952760, + "step": 3953, + "time_per_iteration": 2.4709224700927734 + }, + { + "auxiliary_loss_clip": 0.0113746, + "auxiliary_loss_mlp": 0.01025891, + "balance_loss_clip": 1.05321193, + "balance_loss_mlp": 1.01849985, + "epoch": 0.47544038958696566, + "flos": 30190854965760.0, + "grad_norm": 3.4154588355327014, + "language_loss": 0.70507705, + "learning_rate": 2.2555808868955052e-06, + "loss": 0.72671056, + "num_input_tokens_seen": 84974890, + "step": 3954, + "time_per_iteration": 2.616994619369507 + }, + { + "auxiliary_loss_clip": 0.01125724, + "auxiliary_loss_mlp": 0.01027405, + "balance_loss_clip": 1.04717135, + "balance_loss_mlp": 1.01856577, + "epoch": 0.47556063247760477, + "flos": 23472219738240.0, + "grad_norm": 2.242850593187867, + "language_loss": 0.73732722, + "learning_rate": 2.254808282521738e-06, + "loss": 0.7588585, + "num_input_tokens_seen": 84993640, + "step": 3955, + "time_per_iteration": 2.5972847938537598 + }, + { + "auxiliary_loss_clip": 0.01142271, + "auxiliary_loss_mlp": 0.00762986, + "balance_loss_clip": 1.04797673, + "balance_loss_mlp": 1.00015473, + "epoch": 0.4756808753682438, + "flos": 25155209531520.0, + "grad_norm": 1.688456891365973, + "language_loss": 0.80921721, + "learning_rate": 2.2540356394937573e-06, + "loss": 0.82826972, + "num_input_tokens_seen": 85012340, + "step": 3956, + "time_per_iteration": 2.560209274291992 + }, + { + "auxiliary_loss_clip": 0.01145335, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_clip": 1.04867911, + "balance_loss_mlp": 1.01842046, + "epoch": 0.47580111825888294, + "flos": 15669729573120.0, + "grad_norm": 2.5840713815426923, + "language_loss": 0.83982933, + "learning_rate": 2.253262957928772e-06, + "loss": 0.86155081, + "num_input_tokens_seen": 85029225, + "step": 3957, + "time_per_iteration": 2.498357057571411 + }, + { + "auxiliary_loss_clip": 0.01146383, + "auxiliary_loss_mlp": 0.01027263, + "balance_loss_clip": 1.0465858, + "balance_loss_mlp": 1.01910937, + "epoch": 0.47592136114952205, + "flos": 17636556637440.0, + "grad_norm": 1.6481076950228488, + "language_loss": 0.71979707, + "learning_rate": 2.2524902379439976e-06, + "loss": 0.74153352, + "num_input_tokens_seen": 85047895, + "step": 3958, + "time_per_iteration": 2.4685184955596924 + }, + { + "auxiliary_loss_clip": 0.01036937, + "auxiliary_loss_mlp": 0.01015124, + "balance_loss_clip": 1.03191209, + "balance_loss_mlp": 1.01313341, + "epoch": 0.4760416040401611, + "flos": 61417159292160.0, + "grad_norm": 0.7416526265400596, + "language_loss": 0.63709635, + "learning_rate": 2.251717479656655e-06, + "loss": 0.65761691, + "num_input_tokens_seen": 85112690, + "step": 3959, + "time_per_iteration": 3.199294328689575 + }, + { + "auxiliary_loss_clip": 0.01183416, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.05317748, + "balance_loss_mlp": 1.02187753, + "epoch": 0.4761618469308002, + "flos": 18405871153920.0, + "grad_norm": 2.5720710813834837, + "language_loss": 0.76329547, + "learning_rate": 2.2509446831839704e-06, + "loss": 0.78543437, + "num_input_tokens_seen": 85132130, + "step": 3960, + "time_per_iteration": 2.4176218509674072 + }, + { + "auxiliary_loss_clip": 0.01156009, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.04874122, + "balance_loss_mlp": 1.02462065, + "epoch": 0.4762820898214393, + "flos": 18040911016320.0, + "grad_norm": 2.3312981433200153, + "language_loss": 0.82654268, + "learning_rate": 2.250171848643177e-06, + "loss": 0.84843439, + "num_input_tokens_seen": 85149420, + "step": 3961, + "time_per_iteration": 2.463630199432373 + }, + { + "auxiliary_loss_clip": 0.01150583, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.05071545, + "balance_loss_mlp": 1.02666855, + "epoch": 0.4764023327120784, + "flos": 19318253541120.0, + "grad_norm": 1.7757762507366792, + "language_loss": 0.86164963, + "learning_rate": 2.249398976151513e-06, + "loss": 0.88349473, + "num_input_tokens_seen": 85166970, + "step": 3962, + "time_per_iteration": 2.5673716068267822 + }, + { + "auxiliary_loss_clip": 0.0118074, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.05287385, + "balance_loss_mlp": 1.02404189, + "epoch": 0.4765225756027175, + "flos": 22747255539840.0, + "grad_norm": 2.269308122831583, + "language_loss": 0.78704858, + "learning_rate": 2.248626065826223e-06, + "loss": 0.80917341, + "num_input_tokens_seen": 85185175, + "step": 3963, + "time_per_iteration": 3.270092248916626 + }, + { + "auxiliary_loss_clip": 0.01085144, + "auxiliary_loss_mlp": 0.01003451, + "balance_loss_clip": 1.02123535, + "balance_loss_mlp": 1.00227058, + "epoch": 0.4766428184933566, + "flos": 65933392106880.0, + "grad_norm": 0.7628748756633362, + "language_loss": 0.62553471, + "learning_rate": 2.2478531177845564e-06, + "loss": 0.64642066, + "num_input_tokens_seen": 85246170, + "step": 3964, + "time_per_iteration": 2.9979188442230225 + }, + { + "auxiliary_loss_clip": 0.01154793, + "auxiliary_loss_mlp": 0.01023169, + "balance_loss_clip": 1.04986763, + "balance_loss_mlp": 1.01571274, + "epoch": 0.47676306138399566, + "flos": 24136495908480.0, + "grad_norm": 2.2676695959364475, + "language_loss": 0.84941137, + "learning_rate": 2.247080132143769e-06, + "loss": 0.87119102, + "num_input_tokens_seen": 85268525, + "step": 3965, + "time_per_iteration": 2.5312819480895996 + }, + { + "auxiliary_loss_clip": 0.01137259, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.04306388, + "balance_loss_mlp": 1.0206604, + "epoch": 0.47688330427463477, + "flos": 12604322995200.0, + "grad_norm": 1.946599524660357, + "language_loss": 0.69220626, + "learning_rate": 2.246307109021121e-06, + "loss": 0.71387088, + "num_input_tokens_seen": 85285930, + "step": 3966, + "time_per_iteration": 3.349525213241577 + }, + { + "auxiliary_loss_clip": 0.01150135, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.04738307, + "balance_loss_mlp": 1.02122211, + "epoch": 0.4770035471652739, + "flos": 21390585828480.0, + "grad_norm": 1.8210990834071763, + "language_loss": 0.82020861, + "learning_rate": 2.2455340485338817e-06, + "loss": 0.8420037, + "num_input_tokens_seen": 85303565, + "step": 3967, + "time_per_iteration": 2.5074305534362793 + }, + { + "auxiliary_loss_clip": 0.01167821, + "auxiliary_loss_mlp": 0.01027496, + "balance_loss_clip": 1.05049324, + "balance_loss_mlp": 1.019449, + "epoch": 0.47712379005591293, + "flos": 25156251025920.0, + "grad_norm": 2.2546258739980893, + "language_loss": 0.67999876, + "learning_rate": 2.244760950799322e-06, + "loss": 0.70195192, + "num_input_tokens_seen": 85321835, + "step": 3968, + "time_per_iteration": 3.3435447216033936 + }, + { + "auxiliary_loss_clip": 0.01124292, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.04623389, + "balance_loss_mlp": 1.02103591, + "epoch": 0.47724403294655204, + "flos": 22054323294720.0, + "grad_norm": 1.7402937587528589, + "language_loss": 0.72403592, + "learning_rate": 2.2439878159347203e-06, + "loss": 0.74557066, + "num_input_tokens_seen": 85341260, + "step": 3969, + "time_per_iteration": 2.5409762859344482 + }, + { + "auxiliary_loss_clip": 0.01084001, + "auxiliary_loss_mlp": 0.01001324, + "balance_loss_clip": 1.01954353, + "balance_loss_mlp": 1.00013757, + "epoch": 0.4773642758371911, + "flos": 70229387658240.0, + "grad_norm": 0.7336429076439863, + "language_loss": 0.55572265, + "learning_rate": 2.2432146440573616e-06, + "loss": 0.57657593, + "num_input_tokens_seen": 85407220, + "step": 3970, + "time_per_iteration": 3.126549243927002 + }, + { + "auxiliary_loss_clip": 0.0115302, + "auxiliary_loss_mlp": 0.01027214, + "balance_loss_clip": 1.05038774, + "balance_loss_mlp": 1.01932216, + "epoch": 0.4774845187278302, + "flos": 23548602009600.0, + "grad_norm": 1.8888335085994998, + "language_loss": 0.66403818, + "learning_rate": 2.242441435284534e-06, + "loss": 0.68584049, + "num_input_tokens_seen": 85426095, + "step": 3971, + "time_per_iteration": 3.262711524963379 + }, + { + "auxiliary_loss_clip": 0.01171728, + "auxiliary_loss_mlp": 0.01032068, + "balance_loss_clip": 1.05407298, + "balance_loss_mlp": 1.02297819, + "epoch": 0.4776047616184693, + "flos": 23075371301760.0, + "grad_norm": 2.1976847190672597, + "language_loss": 0.85757387, + "learning_rate": 2.2416681897335337e-06, + "loss": 0.87961185, + "num_input_tokens_seen": 85444245, + "step": 3972, + "time_per_iteration": 2.455953598022461 + }, + { + "auxiliary_loss_clip": 0.0112803, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.05024052, + "balance_loss_mlp": 1.02555537, + "epoch": 0.4777250045091084, + "flos": 31898119374720.0, + "grad_norm": 1.776389658067525, + "language_loss": 0.67022085, + "learning_rate": 2.240894907521661e-06, + "loss": 0.69184434, + "num_input_tokens_seen": 85463325, + "step": 3973, + "time_per_iteration": 2.6557347774505615 + }, + { + "auxiliary_loss_clip": 0.01151655, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.04840493, + "balance_loss_mlp": 1.01963377, + "epoch": 0.4778452473997475, + "flos": 24278163148800.0, + "grad_norm": 1.761736449021304, + "language_loss": 0.64108646, + "learning_rate": 2.240121588766223e-06, + "loss": 0.66287845, + "num_input_tokens_seen": 85483375, + "step": 3974, + "time_per_iteration": 2.5389761924743652 + }, + { + "auxiliary_loss_clip": 0.01146781, + "auxiliary_loss_mlp": 0.01027696, + "balance_loss_clip": 1.04763746, + "balance_loss_mlp": 1.01957822, + "epoch": 0.4779654902903866, + "flos": 31575031516800.0, + "grad_norm": 1.632525483683521, + "language_loss": 0.71173179, + "learning_rate": 2.239348233584531e-06, + "loss": 0.73347658, + "num_input_tokens_seen": 85504230, + "step": 3975, + "time_per_iteration": 2.5691754817962646 + }, + { + "auxiliary_loss_clip": 0.01168319, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.05092502, + "balance_loss_mlp": 1.02532268, + "epoch": 0.47808573318102565, + "flos": 19500428344320.0, + "grad_norm": 1.9025432427835756, + "language_loss": 0.81269556, + "learning_rate": 2.2385748420939013e-06, + "loss": 0.83471906, + "num_input_tokens_seen": 85523425, + "step": 3976, + "time_per_iteration": 2.4381492137908936 + }, + { + "auxiliary_loss_clip": 0.0118344, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.05749369, + "balance_loss_mlp": 1.02167475, + "epoch": 0.47820597607166476, + "flos": 22601135013120.0, + "grad_norm": 1.8277971858326207, + "language_loss": 0.7250663, + "learning_rate": 2.2378014144116583e-06, + "loss": 0.74719793, + "num_input_tokens_seen": 85542235, + "step": 3977, + "time_per_iteration": 2.418173313140869 + }, + { + "auxiliary_loss_clip": 0.01186584, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.0548079, + "balance_loss_mlp": 1.02673841, + "epoch": 0.4783262189623039, + "flos": 23003011353600.0, + "grad_norm": 1.858051524971096, + "language_loss": 0.79761928, + "learning_rate": 2.23702795065513e-06, + "loss": 0.81982934, + "num_input_tokens_seen": 85561815, + "step": 3978, + "time_per_iteration": 2.4411394596099854 + }, + { + "auxiliary_loss_clip": 0.01074899, + "auxiliary_loss_mlp": 0.01001818, + "balance_loss_clip": 1.02020633, + "balance_loss_mlp": 1.00062561, + "epoch": 0.47844646185294293, + "flos": 49772801226240.0, + "grad_norm": 0.9939924251529546, + "language_loss": 0.67503238, + "learning_rate": 2.2362544509416493e-06, + "loss": 0.69579953, + "num_input_tokens_seen": 85613930, + "step": 3979, + "time_per_iteration": 2.9299569129943848 + }, + { + "auxiliary_loss_clip": 0.01144554, + "auxiliary_loss_mlp": 0.01034241, + "balance_loss_clip": 1.04686737, + "balance_loss_mlp": 1.02612305, + "epoch": 0.47856670474358204, + "flos": 20229558520320.0, + "grad_norm": 2.1673715058317016, + "language_loss": 0.82476485, + "learning_rate": 2.2354809153885572e-06, + "loss": 0.84655285, + "num_input_tokens_seen": 85631000, + "step": 3980, + "time_per_iteration": 2.48453688621521 + }, + { + "auxiliary_loss_clip": 0.01166358, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.04986358, + "balance_loss_mlp": 1.02284467, + "epoch": 0.47868694763422115, + "flos": 20990936131200.0, + "grad_norm": 1.6820024842968728, + "language_loss": 0.83080387, + "learning_rate": 2.234707344113197e-06, + "loss": 0.85277867, + "num_input_tokens_seen": 85649095, + "step": 3981, + "time_per_iteration": 2.435882329940796 + }, + { + "auxiliary_loss_clip": 0.01177533, + "auxiliary_loss_mlp": 0.01025285, + "balance_loss_clip": 1.05177999, + "balance_loss_mlp": 1.01770902, + "epoch": 0.4788071905248602, + "flos": 19026551191680.0, + "grad_norm": 2.2898127098641003, + "language_loss": 0.77545059, + "learning_rate": 2.233933737232919e-06, + "loss": 0.7974788, + "num_input_tokens_seen": 85666875, + "step": 3982, + "time_per_iteration": 2.4090487957000732 + }, + { + "auxiliary_loss_clip": 0.01116574, + "auxiliary_loss_mlp": 0.00762428, + "balance_loss_clip": 1.04493928, + "balance_loss_mlp": 1.00005317, + "epoch": 0.4789274334154993, + "flos": 23002221254400.0, + "grad_norm": 1.8953805438065205, + "language_loss": 0.78105259, + "learning_rate": 2.2331600948650793e-06, + "loss": 0.7998426, + "num_input_tokens_seen": 85687020, + "step": 3983, + "time_per_iteration": 2.545619487762451 + }, + { + "auxiliary_loss_clip": 0.01129713, + "auxiliary_loss_mlp": 0.00763618, + "balance_loss_clip": 1.04945779, + "balance_loss_mlp": 1.00006223, + "epoch": 0.4790476763061384, + "flos": 23075586783360.0, + "grad_norm": 1.4386954195740431, + "language_loss": 0.80384129, + "learning_rate": 2.2323864171270386e-06, + "loss": 0.82277459, + "num_input_tokens_seen": 85708290, + "step": 3984, + "time_per_iteration": 2.549447536468506 + }, + { + "auxiliary_loss_clip": 0.01141761, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.04670978, + "balance_loss_mlp": 1.0204339, + "epoch": 0.4791679191967775, + "flos": 21179288073600.0, + "grad_norm": 1.7739503564224635, + "language_loss": 0.72467792, + "learning_rate": 2.231612704136164e-06, + "loss": 0.74638855, + "num_input_tokens_seen": 85728660, + "step": 3985, + "time_per_iteration": 2.5680487155914307 + }, + { + "auxiliary_loss_clip": 0.01163455, + "auxiliary_loss_mlp": 0.01029451, + "balance_loss_clip": 1.04987931, + "balance_loss_mlp": 1.02105272, + "epoch": 0.4792881620874166, + "flos": 22301495758080.0, + "grad_norm": 2.430686904060973, + "language_loss": 0.74837899, + "learning_rate": 2.2308389560098253e-06, + "loss": 0.77030802, + "num_input_tokens_seen": 85745035, + "step": 3986, + "time_per_iteration": 2.4403202533721924 + }, + { + "auxiliary_loss_clip": 0.01146213, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.05261624, + "balance_loss_mlp": 1.01976955, + "epoch": 0.47940840497805565, + "flos": 17420877423360.0, + "grad_norm": 3.8204621774113923, + "language_loss": 0.77177101, + "learning_rate": 2.2300651728654008e-06, + "loss": 0.79351878, + "num_input_tokens_seen": 85760295, + "step": 3987, + "time_per_iteration": 2.511549949645996 + }, + { + "auxiliary_loss_clip": 0.01065503, + "auxiliary_loss_mlp": 0.00753057, + "balance_loss_clip": 1.0169965, + "balance_loss_mlp": 0.9999547, + "epoch": 0.47952864786869476, + "flos": 65358175708800.0, + "grad_norm": 0.7327198027798575, + "language_loss": 0.60208541, + "learning_rate": 2.229291354820272e-06, + "loss": 0.62027103, + "num_input_tokens_seen": 85821305, + "step": 3988, + "time_per_iteration": 3.0782713890075684 + }, + { + "auxiliary_loss_clip": 0.01164959, + "auxiliary_loss_mlp": 0.0103044, + "balance_loss_clip": 1.04941332, + "balance_loss_mlp": 1.02195215, + "epoch": 0.47964889075933387, + "flos": 16799802336000.0, + "grad_norm": 1.9265582463770996, + "language_loss": 0.76056862, + "learning_rate": 2.228517501991828e-06, + "loss": 0.78252256, + "num_input_tokens_seen": 85840105, + "step": 3989, + "time_per_iteration": 2.4439468383789062 + }, + { + "auxiliary_loss_clip": 0.01057014, + "auxiliary_loss_mlp": 0.01003261, + "balance_loss_clip": 1.01804996, + "balance_loss_mlp": 1.0021286, + "epoch": 0.4797691336499729, + "flos": 70079244808320.0, + "grad_norm": 0.8269018489069816, + "language_loss": 0.6105842, + "learning_rate": 2.22774361449746e-06, + "loss": 0.6311869, + "num_input_tokens_seen": 85896585, + "step": 3990, + "time_per_iteration": 3.928361654281616 + }, + { + "auxiliary_loss_clip": 0.0111311, + "auxiliary_loss_mlp": 0.01027864, + "balance_loss_clip": 1.04853439, + "balance_loss_mlp": 1.01973951, + "epoch": 0.47988937654061203, + "flos": 18953329317120.0, + "grad_norm": 2.865706420917079, + "language_loss": 0.70823878, + "learning_rate": 2.2269696924545668e-06, + "loss": 0.72964853, + "num_input_tokens_seen": 85914415, + "step": 3991, + "time_per_iteration": 2.58437442779541 + }, + { + "auxiliary_loss_clip": 0.01142927, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.05286992, + "balance_loss_mlp": 1.02127528, + "epoch": 0.48000961943125114, + "flos": 14461981649280.0, + "grad_norm": 2.3733513305385134, + "language_loss": 0.78378046, + "learning_rate": 2.2261957359805523e-06, + "loss": 0.80550164, + "num_input_tokens_seen": 85931650, + "step": 3992, + "time_per_iteration": 2.5114989280700684 + }, + { + "auxiliary_loss_clip": 0.01182526, + "auxiliary_loss_mlp": 0.01023449, + "balance_loss_clip": 1.05332077, + "balance_loss_mlp": 1.01557255, + "epoch": 0.4801298623218902, + "flos": 27051149105280.0, + "grad_norm": 2.7940978832993286, + "language_loss": 0.74187064, + "learning_rate": 2.225421745192823e-06, + "loss": 0.76393032, + "num_input_tokens_seen": 85951805, + "step": 3993, + "time_per_iteration": 3.326186180114746 + }, + { + "auxiliary_loss_clip": 0.01166701, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.05206537, + "balance_loss_mlp": 1.02068472, + "epoch": 0.4802501052125293, + "flos": 26355236031360.0, + "grad_norm": 2.1713753448950857, + "language_loss": 0.78271371, + "learning_rate": 2.2246477202087955e-06, + "loss": 0.80467105, + "num_input_tokens_seen": 85972485, + "step": 3994, + "time_per_iteration": 2.505716562271118 + }, + { + "auxiliary_loss_clip": 0.01155824, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.05045068, + "balance_loss_mlp": 1.02380204, + "epoch": 0.4803703481031684, + "flos": 20993916960000.0, + "grad_norm": 1.5563073175802402, + "language_loss": 0.8277812, + "learning_rate": 2.223873661145887e-06, + "loss": 0.84965098, + "num_input_tokens_seen": 85992540, + "step": 3995, + "time_per_iteration": 3.3509137630462646 + }, + { + "auxiliary_loss_clip": 0.01155156, + "auxiliary_loss_mlp": 0.00762901, + "balance_loss_clip": 1.05538583, + "balance_loss_mlp": 1.00001454, + "epoch": 0.4804905909938075, + "flos": 20703722981760.0, + "grad_norm": 1.5493262984513207, + "language_loss": 0.71310496, + "learning_rate": 2.2230995681215226e-06, + "loss": 0.73228556, + "num_input_tokens_seen": 86012065, + "step": 3996, + "time_per_iteration": 2.5059707164764404 + }, + { + "auxiliary_loss_clip": 0.01138284, + "auxiliary_loss_mlp": 0.0102645, + "balance_loss_clip": 1.05118155, + "balance_loss_mlp": 1.01886284, + "epoch": 0.4806108338844466, + "flos": 16654831044480.0, + "grad_norm": 2.3737159898476636, + "language_loss": 0.77964944, + "learning_rate": 2.2223254412531305e-06, + "loss": 0.80129671, + "num_input_tokens_seen": 86029435, + "step": 3997, + "time_per_iteration": 3.2671427726745605 + }, + { + "auxiliary_loss_clip": 0.01138616, + "auxiliary_loss_mlp": 0.01025982, + "balance_loss_clip": 1.04543066, + "balance_loss_mlp": 1.01820016, + "epoch": 0.4807310767750857, + "flos": 20011329440640.0, + "grad_norm": 1.8537446652298992, + "language_loss": 0.82521695, + "learning_rate": 2.221551280658146e-06, + "loss": 0.84686291, + "num_input_tokens_seen": 86048495, + "step": 3998, + "time_per_iteration": 2.47847056388855 + }, + { + "auxiliary_loss_clip": 0.01122415, + "auxiliary_loss_mlp": 0.01025565, + "balance_loss_clip": 1.0481379, + "balance_loss_mlp": 1.01785791, + "epoch": 0.48085131966572475, + "flos": 23185257984000.0, + "grad_norm": 1.6348183843005455, + "language_loss": 0.7434231, + "learning_rate": 2.2207770864540085e-06, + "loss": 0.76490295, + "num_input_tokens_seen": 86067470, + "step": 3999, + "time_per_iteration": 2.564797878265381 + }, + { + "auxiliary_loss_clip": 0.01146218, + "auxiliary_loss_mlp": 0.01029356, + "balance_loss_clip": 1.0497179, + "balance_loss_mlp": 1.02116036, + "epoch": 0.48097156255636386, + "flos": 20558643949440.0, + "grad_norm": 2.026055341939582, + "language_loss": 0.73038745, + "learning_rate": 2.220002858758162e-06, + "loss": 0.75214314, + "num_input_tokens_seen": 86085460, + "step": 4000, + "time_per_iteration": 2.4935972690582275 + }, + { + "auxiliary_loss_clip": 0.01071435, + "auxiliary_loss_mlp": 0.0100087, + "balance_loss_clip": 1.01699233, + "balance_loss_mlp": 0.99974364, + "epoch": 0.481091805447003, + "flos": 70511608817280.0, + "grad_norm": 0.9024897626198788, + "language_loss": 0.60884935, + "learning_rate": 2.2192285976880573e-06, + "loss": 0.62957239, + "num_input_tokens_seen": 86149715, + "step": 4001, + "time_per_iteration": 3.0582528114318848 + }, + { + "auxiliary_loss_clip": 0.01143968, + "auxiliary_loss_mlp": 0.00762087, + "balance_loss_clip": 1.04902232, + "balance_loss_mlp": 1.00003994, + "epoch": 0.48121204833764203, + "flos": 36428214839040.0, + "grad_norm": 1.5436701532378354, + "language_loss": 0.80716383, + "learning_rate": 2.2184543033611485e-06, + "loss": 0.82622439, + "num_input_tokens_seen": 86170795, + "step": 4002, + "time_per_iteration": 2.66536283493042 + }, + { + "auxiliary_loss_clip": 0.01169806, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.05230081, + "balance_loss_mlp": 1.02324176, + "epoch": 0.48133229122828114, + "flos": 27490264871040.0, + "grad_norm": 2.255906294504358, + "language_loss": 0.8202607, + "learning_rate": 2.2176799758948957e-06, + "loss": 0.84226793, + "num_input_tokens_seen": 86190955, + "step": 4003, + "time_per_iteration": 2.516401529312134 + }, + { + "auxiliary_loss_clip": 0.01149143, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.05103505, + "balance_loss_mlp": 1.02409852, + "epoch": 0.4814525341189202, + "flos": 43072802179200.0, + "grad_norm": 1.8202217279820037, + "language_loss": 0.73234135, + "learning_rate": 2.2169056154067635e-06, + "loss": 0.7541517, + "num_input_tokens_seen": 86214875, + "step": 4004, + "time_per_iteration": 2.7078208923339844 + }, + { + "auxiliary_loss_clip": 0.01171497, + "auxiliary_loss_mlp": 0.00762869, + "balance_loss_clip": 1.05524826, + "balance_loss_mlp": 1.00001323, + "epoch": 0.4815727770095593, + "flos": 24236901400320.0, + "grad_norm": 1.8220416222679847, + "language_loss": 0.82599258, + "learning_rate": 2.216131222014222e-06, + "loss": 0.84533632, + "num_input_tokens_seen": 86232950, + "step": 4005, + "time_per_iteration": 2.5012128353118896 + }, + { + "auxiliary_loss_clip": 0.01135469, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.0492537, + "balance_loss_mlp": 1.02397561, + "epoch": 0.4816930199001984, + "flos": 18113630100480.0, + "grad_norm": 2.1702239270510395, + "language_loss": 0.79992437, + "learning_rate": 2.2153567958347455e-06, + "loss": 0.82160366, + "num_input_tokens_seen": 86249160, + "step": 4006, + "time_per_iteration": 2.4960286617279053 + }, + { + "auxiliary_loss_clip": 0.0115401, + "auxiliary_loss_mlp": 0.0102627, + "balance_loss_clip": 1.05311728, + "balance_loss_mlp": 1.01799643, + "epoch": 0.48181326279083747, + "flos": 17274720983040.0, + "grad_norm": 2.036974211627786, + "language_loss": 0.80058861, + "learning_rate": 2.214582336985815e-06, + "loss": 0.82239139, + "num_input_tokens_seen": 86267060, + "step": 4007, + "time_per_iteration": 2.470130205154419 + }, + { + "auxiliary_loss_clip": 0.01144592, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.04900789, + "balance_loss_mlp": 1.02032948, + "epoch": 0.4819335056814766, + "flos": 14903252231040.0, + "grad_norm": 2.197255688775241, + "language_loss": 0.66410673, + "learning_rate": 2.2138078455849142e-06, + "loss": 0.68584371, + "num_input_tokens_seen": 86285055, + "step": 4008, + "time_per_iteration": 2.46886944770813 + }, + { + "auxiliary_loss_clip": 0.01175495, + "auxiliary_loss_mlp": 0.01029232, + "balance_loss_clip": 1.05457318, + "balance_loss_mlp": 1.02145362, + "epoch": 0.4820537485721157, + "flos": 19244888012160.0, + "grad_norm": 2.0553792454781026, + "language_loss": 0.78387129, + "learning_rate": 2.2130333217495334e-06, + "loss": 0.80591857, + "num_input_tokens_seen": 86304225, + "step": 4009, + "time_per_iteration": 2.4619975090026855 + }, + { + "auxiliary_loss_clip": 0.01150172, + "auxiliary_loss_mlp": 0.01027808, + "balance_loss_clip": 1.05039942, + "balance_loss_mlp": 1.01979709, + "epoch": 0.48217399146275475, + "flos": 16033791870720.0, + "grad_norm": 3.1064920092466664, + "language_loss": 0.67348146, + "learning_rate": 2.2122587655971665e-06, + "loss": 0.69526124, + "num_input_tokens_seen": 86319170, + "step": 4010, + "time_per_iteration": 2.5164031982421875 + }, + { + "auxiliary_loss_clip": 0.01154258, + "auxiliary_loss_mlp": 0.01028917, + "balance_loss_clip": 1.05053461, + "balance_loss_mlp": 1.02138257, + "epoch": 0.48229423435339386, + "flos": 24134197438080.0, + "grad_norm": 1.6605789856705067, + "language_loss": 0.6407125, + "learning_rate": 2.211484177245314e-06, + "loss": 0.66254425, + "num_input_tokens_seen": 86338760, + "step": 4011, + "time_per_iteration": 2.512725353240967 + }, + { + "auxiliary_loss_clip": 0.01185934, + "auxiliary_loss_mlp": 0.01032464, + "balance_loss_clip": 1.05541587, + "balance_loss_mlp": 1.02424407, + "epoch": 0.48241447724403297, + "flos": 23805435231360.0, + "grad_norm": 1.9295960021644893, + "language_loss": 0.71834379, + "learning_rate": 2.21070955681148e-06, + "loss": 0.74052781, + "num_input_tokens_seen": 86357865, + "step": 4012, + "time_per_iteration": 2.449613332748413 + }, + { + "auxiliary_loss_clip": 0.01132562, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.04869962, + "balance_loss_mlp": 1.02030432, + "epoch": 0.482534720134672, + "flos": 23110312256640.0, + "grad_norm": 1.5838853825164083, + "language_loss": 0.77872318, + "learning_rate": 2.209934904413174e-06, + "loss": 0.8003301, + "num_input_tokens_seen": 86379470, + "step": 4013, + "time_per_iteration": 2.5532708168029785 + }, + { + "auxiliary_loss_clip": 0.01104976, + "auxiliary_loss_mlp": 0.0102861, + "balance_loss_clip": 1.03781271, + "balance_loss_mlp": 1.01998472, + "epoch": 0.48265496302531113, + "flos": 20923819568640.0, + "grad_norm": 1.8990438432396708, + "language_loss": 0.71644902, + "learning_rate": 2.2091602201679095e-06, + "loss": 0.73778486, + "num_input_tokens_seen": 86399080, + "step": 4014, + "time_per_iteration": 2.612545967102051 + }, + { + "auxiliary_loss_clip": 0.01146133, + "auxiliary_loss_mlp": 0.01025539, + "balance_loss_clip": 1.05163813, + "balance_loss_mlp": 1.01771331, + "epoch": 0.48277520591595025, + "flos": 15231152511360.0, + "grad_norm": 2.346585112815046, + "language_loss": 0.83471477, + "learning_rate": 2.208385504193206e-06, + "loss": 0.85643142, + "num_input_tokens_seen": 86416580, + "step": 4015, + "time_per_iteration": 2.585113763809204 + }, + { + "auxiliary_loss_clip": 0.0118359, + "auxiliary_loss_mlp": 0.01020524, + "balance_loss_clip": 1.0540421, + "balance_loss_mlp": 1.01265633, + "epoch": 0.4828954488065893, + "flos": 17858664385920.0, + "grad_norm": 1.8973272530851735, + "language_loss": 0.81183368, + "learning_rate": 2.2076107566065873e-06, + "loss": 0.83387482, + "num_input_tokens_seen": 86434365, + "step": 4016, + "time_per_iteration": 2.4063713550567627 + }, + { + "auxiliary_loss_clip": 0.01174544, + "auxiliary_loss_mlp": 0.01031973, + "balance_loss_clip": 1.05557704, + "balance_loss_mlp": 1.02471304, + "epoch": 0.4830156916972284, + "flos": 32087405070720.0, + "grad_norm": 2.1320415127706593, + "language_loss": 0.75556278, + "learning_rate": 2.2068359775255816e-06, + "loss": 0.77762794, + "num_input_tokens_seen": 86452675, + "step": 4017, + "time_per_iteration": 3.383261203765869 + }, + { + "auxiliary_loss_clip": 0.01121257, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.04743946, + "balance_loss_mlp": 1.02215123, + "epoch": 0.48313593458786747, + "flos": 21871717528320.0, + "grad_norm": 2.789894383423264, + "language_loss": 0.77914602, + "learning_rate": 2.206061167067723e-06, + "loss": 0.80066115, + "num_input_tokens_seen": 86470785, + "step": 4018, + "time_per_iteration": 2.5599255561828613 + }, + { + "auxiliary_loss_clip": 0.01137514, + "auxiliary_loss_mlp": 0.01026552, + "balance_loss_clip": 1.04612303, + "balance_loss_mlp": 1.01775455, + "epoch": 0.4832561774785066, + "flos": 22601206840320.0, + "grad_norm": 1.9769122285139347, + "language_loss": 0.79428267, + "learning_rate": 2.205286325350549e-06, + "loss": 0.81592333, + "num_input_tokens_seen": 86489850, + "step": 4019, + "time_per_iteration": 3.403686761856079 + }, + { + "auxiliary_loss_clip": 0.01126783, + "auxiliary_loss_mlp": 0.01028771, + "balance_loss_clip": 1.04873157, + "balance_loss_mlp": 1.0213325, + "epoch": 0.4833764203691457, + "flos": 13437342282240.0, + "grad_norm": 4.350399558320729, + "language_loss": 0.72452468, + "learning_rate": 2.204511452491603e-06, + "loss": 0.74608022, + "num_input_tokens_seen": 86506475, + "step": 4020, + "time_per_iteration": 2.5112051963806152 + }, + { + "auxiliary_loss_clip": 0.01182201, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.05754459, + "balance_loss_mlp": 1.02304745, + "epoch": 0.48349666325978474, + "flos": 44128036955520.0, + "grad_norm": 1.7958611125963242, + "language_loss": 0.74959695, + "learning_rate": 2.2037365486084316e-06, + "loss": 0.77172744, + "num_input_tokens_seen": 86529715, + "step": 4021, + "time_per_iteration": 3.460017442703247 + }, + { + "auxiliary_loss_clip": 0.01150004, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.04870439, + "balance_loss_mlp": 1.02190423, + "epoch": 0.48361690615042385, + "flos": 26028377245440.0, + "grad_norm": 2.422226814400363, + "language_loss": 0.78060877, + "learning_rate": 2.2029616138185886e-06, + "loss": 0.80241197, + "num_input_tokens_seen": 86548715, + "step": 4022, + "time_per_iteration": 2.5632619857788086 + }, + { + "auxiliary_loss_clip": 0.01143809, + "auxiliary_loss_mlp": 0.01030575, + "balance_loss_clip": 1.05586326, + "balance_loss_mlp": 1.02272809, + "epoch": 0.48373714904106296, + "flos": 22273306560000.0, + "grad_norm": 1.6361565300408787, + "language_loss": 0.82443738, + "learning_rate": 2.202186648239629e-06, + "loss": 0.84618121, + "num_input_tokens_seen": 86568650, + "step": 4023, + "time_per_iteration": 2.533316135406494 + }, + { + "auxiliary_loss_clip": 0.01169299, + "auxiliary_loss_mlp": 0.01027875, + "balance_loss_clip": 1.05483937, + "balance_loss_mlp": 1.02019751, + "epoch": 0.483857391931702, + "flos": 28292293699200.0, + "grad_norm": 1.7159135744511294, + "language_loss": 0.71730506, + "learning_rate": 2.201411651989117e-06, + "loss": 0.73927683, + "num_input_tokens_seen": 86590630, + "step": 4024, + "time_per_iteration": 3.3014490604400635 + }, + { + "auxiliary_loss_clip": 0.01157861, + "auxiliary_loss_mlp": 0.00762423, + "balance_loss_clip": 1.05555797, + "balance_loss_mlp": 1.00001144, + "epoch": 0.48397763482234113, + "flos": 27418048577280.0, + "grad_norm": 1.9426368383318275, + "language_loss": 0.7784096, + "learning_rate": 2.2006366251846167e-06, + "loss": 0.79761243, + "num_input_tokens_seen": 86611270, + "step": 4025, + "time_per_iteration": 2.5484061241149902 + }, + { + "auxiliary_loss_clip": 0.01156138, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.05429506, + "balance_loss_mlp": 1.02299428, + "epoch": 0.48409787771298024, + "flos": 16797252470400.0, + "grad_norm": 1.7070869882399438, + "language_loss": 0.75565135, + "learning_rate": 2.1998615679436997e-06, + "loss": 0.77751541, + "num_input_tokens_seen": 86628810, + "step": 4026, + "time_per_iteration": 2.4667627811431885 + }, + { + "auxiliary_loss_clip": 0.01165313, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.05495644, + "balance_loss_mlp": 1.0214715, + "epoch": 0.4842181206036193, + "flos": 25083496028160.0, + "grad_norm": 2.0833561273461214, + "language_loss": 0.77035522, + "learning_rate": 2.199086480383942e-06, + "loss": 0.79230851, + "num_input_tokens_seen": 86648185, + "step": 4027, + "time_per_iteration": 2.5388615131378174 + }, + { + "auxiliary_loss_clip": 0.01169003, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.05447793, + "balance_loss_mlp": 1.02546382, + "epoch": 0.4843383634942584, + "flos": 30372311496960.0, + "grad_norm": 3.164689330864456, + "language_loss": 0.67496449, + "learning_rate": 2.1983113626229234e-06, + "loss": 0.6969986, + "num_input_tokens_seen": 86667435, + "step": 4028, + "time_per_iteration": 2.5717720985412598 + }, + { + "auxiliary_loss_clip": 0.01137424, + "auxiliary_loss_mlp": 0.0076264, + "balance_loss_clip": 1.0501976, + "balance_loss_mlp": 1.00003588, + "epoch": 0.4844586063848975, + "flos": 20413564917120.0, + "grad_norm": 1.791005216862981, + "language_loss": 0.78418124, + "learning_rate": 2.1975362147782293e-06, + "loss": 0.80318189, + "num_input_tokens_seen": 86686630, + "step": 4029, + "time_per_iteration": 2.5636374950408936 + }, + { + "auxiliary_loss_clip": 0.010733, + "auxiliary_loss_mlp": 0.01002223, + "balance_loss_clip": 1.03510737, + "balance_loss_mlp": 1.00051832, + "epoch": 0.48457884927553657, + "flos": 70303722854400.0, + "grad_norm": 0.693168743604907, + "language_loss": 0.54143679, + "learning_rate": 2.196761036967448e-06, + "loss": 0.56219196, + "num_input_tokens_seen": 86754595, + "step": 4030, + "time_per_iteration": 3.2325029373168945 + }, + { + "auxiliary_loss_clip": 0.01165098, + "auxiliary_loss_mlp": 0.0102481, + "balance_loss_clip": 1.05240369, + "balance_loss_mlp": 1.01714468, + "epoch": 0.4846990921661757, + "flos": 19934516206080.0, + "grad_norm": 1.615163937148822, + "language_loss": 0.77574557, + "learning_rate": 2.1959858293081743e-06, + "loss": 0.79764467, + "num_input_tokens_seen": 86773730, + "step": 4031, + "time_per_iteration": 2.475933790206909 + }, + { + "auxiliary_loss_clip": 0.01139956, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_clip": 1.05192852, + "balance_loss_mlp": 1.01986337, + "epoch": 0.4848193350568148, + "flos": 23075945919360.0, + "grad_norm": 1.6198280526942903, + "language_loss": 0.76113212, + "learning_rate": 2.1952105919180056e-06, + "loss": 0.78281111, + "num_input_tokens_seen": 86792985, + "step": 4032, + "time_per_iteration": 2.5688636302948 + }, + { + "auxiliary_loss_clip": 0.01156093, + "auxiliary_loss_mlp": 0.0102073, + "balance_loss_clip": 1.05360687, + "balance_loss_mlp": 1.01236784, + "epoch": 0.48493957794745385, + "flos": 22455481363200.0, + "grad_norm": 2.257211362095846, + "language_loss": 0.67835373, + "learning_rate": 2.1944353249145456e-06, + "loss": 0.700122, + "num_input_tokens_seen": 86812095, + "step": 4033, + "time_per_iteration": 2.550600528717041 + }, + { + "auxiliary_loss_clip": 0.01185952, + "auxiliary_loss_mlp": 0.01030916, + "balance_loss_clip": 1.05857873, + "balance_loss_mlp": 1.02329826, + "epoch": 0.48505982083809296, + "flos": 25046112948480.0, + "grad_norm": 1.453879475665238, + "language_loss": 0.74531084, + "learning_rate": 2.193660028415401e-06, + "loss": 0.76747954, + "num_input_tokens_seen": 86832875, + "step": 4034, + "time_per_iteration": 2.4884185791015625 + }, + { + "auxiliary_loss_clip": 0.01147805, + "auxiliary_loss_mlp": 0.01023538, + "balance_loss_clip": 1.05176902, + "balance_loss_mlp": 1.01554513, + "epoch": 0.485180063728732, + "flos": 26761386090240.0, + "grad_norm": 2.288873035078123, + "language_loss": 0.82473969, + "learning_rate": 2.1928847025381852e-06, + "loss": 0.84645307, + "num_input_tokens_seen": 86853480, + "step": 4035, + "time_per_iteration": 2.567490577697754 + }, + { + "auxiliary_loss_clip": 0.01168067, + "auxiliary_loss_mlp": 0.01027018, + "balance_loss_clip": 1.05062187, + "balance_loss_mlp": 1.01819611, + "epoch": 0.4853003066193711, + "flos": 24059143969920.0, + "grad_norm": 1.647676469350279, + "language_loss": 0.83809036, + "learning_rate": 2.192109347400512e-06, + "loss": 0.8600412, + "num_input_tokens_seen": 86873695, + "step": 4036, + "time_per_iteration": 2.5026042461395264 + }, + { + "auxiliary_loss_clip": 0.01158305, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.05186319, + "balance_loss_mlp": 1.02411163, + "epoch": 0.48542054951001024, + "flos": 23076376882560.0, + "grad_norm": 1.6434095893706393, + "language_loss": 0.79051471, + "learning_rate": 2.191333963120004e-06, + "loss": 0.81242764, + "num_input_tokens_seen": 86892675, + "step": 4037, + "time_per_iteration": 2.514356851577759 + }, + { + "auxiliary_loss_clip": 0.01158109, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.05376327, + "balance_loss_mlp": 1.02360606, + "epoch": 0.4855407924006493, + "flos": 25664889565440.0, + "grad_norm": 2.4253763732347147, + "language_loss": 0.70460498, + "learning_rate": 2.190558549814286e-06, + "loss": 0.72650921, + "num_input_tokens_seen": 86912835, + "step": 4038, + "time_per_iteration": 2.537055492401123 + }, + { + "auxiliary_loss_clip": 0.01154999, + "auxiliary_loss_mlp": 0.01026686, + "balance_loss_clip": 1.05221272, + "balance_loss_mlp": 1.01881254, + "epoch": 0.4856610352912884, + "flos": 23987933256960.0, + "grad_norm": 1.7292565221305087, + "language_loss": 0.79308236, + "learning_rate": 2.1897831076009872e-06, + "loss": 0.81489921, + "num_input_tokens_seen": 86932475, + "step": 4039, + "time_per_iteration": 2.512850761413574 + }, + { + "auxiliary_loss_clip": 0.01170373, + "auxiliary_loss_mlp": 0.01025949, + "balance_loss_clip": 1.05404365, + "balance_loss_mlp": 1.01807523, + "epoch": 0.4857812781819275, + "flos": 24096814358400.0, + "grad_norm": 1.613813208040834, + "language_loss": 0.80333507, + "learning_rate": 2.1890076365977426e-06, + "loss": 0.82529831, + "num_input_tokens_seen": 86952300, + "step": 4040, + "time_per_iteration": 2.4895386695861816 + }, + { + "auxiliary_loss_clip": 0.01056137, + "auxiliary_loss_mlp": 0.01007984, + "balance_loss_clip": 1.01821899, + "balance_loss_mlp": 1.00675011, + "epoch": 0.48590152107256657, + "flos": 56266635185280.0, + "grad_norm": 0.8551880458968137, + "language_loss": 0.52817249, + "learning_rate": 2.188232136922189e-06, + "loss": 0.5488137, + "num_input_tokens_seen": 87010420, + "step": 4041, + "time_per_iteration": 3.0015077590942383 + }, + { + "auxiliary_loss_clip": 0.01111436, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.04953575, + "balance_loss_mlp": 1.0224216, + "epoch": 0.4860217639632057, + "flos": 20046988667520.0, + "grad_norm": 1.843162427098352, + "language_loss": 0.75431663, + "learning_rate": 2.187456608691971e-06, + "loss": 0.77574092, + "num_input_tokens_seen": 87029295, + "step": 4042, + "time_per_iteration": 2.5970029830932617 + }, + { + "auxiliary_loss_clip": 0.01149956, + "auxiliary_loss_mlp": 0.01032229, + "balance_loss_clip": 1.05697215, + "balance_loss_mlp": 1.02371144, + "epoch": 0.4861420068538448, + "flos": 17822143232640.0, + "grad_norm": 1.8753014925101452, + "language_loss": 0.87677664, + "learning_rate": 2.1866810520247334e-06, + "loss": 0.89859855, + "num_input_tokens_seen": 87048165, + "step": 4043, + "time_per_iteration": 2.5197627544403076 + }, + { + "auxiliary_loss_clip": 0.01174997, + "auxiliary_loss_mlp": 0.01024559, + "balance_loss_clip": 1.05332828, + "balance_loss_mlp": 1.01584458, + "epoch": 0.48626224974448384, + "flos": 26250125857920.0, + "grad_norm": 2.3554491165112736, + "language_loss": 0.64988488, + "learning_rate": 2.185905467038129e-06, + "loss": 0.67188048, + "num_input_tokens_seen": 87067070, + "step": 4044, + "time_per_iteration": 3.413688898086548 + }, + { + "auxiliary_loss_clip": 0.01183121, + "auxiliary_loss_mlp": 0.01025501, + "balance_loss_clip": 1.05732, + "balance_loss_mlp": 1.01768053, + "epoch": 0.48638249263512295, + "flos": 22054502862720.0, + "grad_norm": 2.0463745882145945, + "language_loss": 0.77482259, + "learning_rate": 2.1851298538498127e-06, + "loss": 0.7969088, + "num_input_tokens_seen": 87086785, + "step": 4045, + "time_per_iteration": 2.4491310119628906 + }, + { + "auxiliary_loss_clip": 0.01177968, + "auxiliary_loss_mlp": 0.00763345, + "balance_loss_clip": 1.05739164, + "balance_loss_mlp": 1.00016522, + "epoch": 0.48650273552576206, + "flos": 25119945354240.0, + "grad_norm": 1.8469257868097886, + "language_loss": 0.79962122, + "learning_rate": 2.184354212577446e-06, + "loss": 0.81903434, + "num_input_tokens_seen": 87107090, + "step": 4046, + "time_per_iteration": 3.3428757190704346 + }, + { + "auxiliary_loss_clip": 0.01187616, + "auxiliary_loss_mlp": 0.01026972, + "balance_loss_clip": 1.05630851, + "balance_loss_mlp": 1.01869869, + "epoch": 0.4866229784164011, + "flos": 17456931699840.0, + "grad_norm": 4.218166042504802, + "language_loss": 0.62811887, + "learning_rate": 2.1835785433386907e-06, + "loss": 0.65026474, + "num_input_tokens_seen": 87125905, + "step": 4047, + "time_per_iteration": 3.2227940559387207 + }, + { + "auxiliary_loss_clip": 0.01132377, + "auxiliary_loss_mlp": 0.01031397, + "balance_loss_clip": 1.05056989, + "balance_loss_mlp": 1.02305818, + "epoch": 0.48674322130704023, + "flos": 23331127115520.0, + "grad_norm": 1.8853798710415162, + "language_loss": 0.65365708, + "learning_rate": 2.182802846251216e-06, + "loss": 0.67529476, + "num_input_tokens_seen": 87146175, + "step": 4048, + "time_per_iteration": 2.538496494293213 + }, + { + "auxiliary_loss_clip": 0.01147126, + "auxiliary_loss_mlp": 0.0102621, + "balance_loss_clip": 1.05112743, + "balance_loss_mlp": 1.01804423, + "epoch": 0.4868634641976793, + "flos": 28804344030720.0, + "grad_norm": 2.2938470278100827, + "language_loss": 0.72392857, + "learning_rate": 2.182027121432696e-06, + "loss": 0.74566191, + "num_input_tokens_seen": 87166800, + "step": 4049, + "time_per_iteration": 2.5950021743774414 + }, + { + "auxiliary_loss_clip": 0.01188154, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.05713403, + "balance_loss_mlp": 1.02131259, + "epoch": 0.4869837070883184, + "flos": 19025976574080.0, + "grad_norm": 1.7345402287166836, + "language_loss": 0.82433403, + "learning_rate": 2.1812513690008054e-06, + "loss": 0.84652013, + "num_input_tokens_seen": 87185920, + "step": 4050, + "time_per_iteration": 2.4374701976776123 + }, + { + "auxiliary_loss_clip": 0.01178251, + "auxiliary_loss_mlp": 0.01028711, + "balance_loss_clip": 1.05663967, + "balance_loss_mlp": 1.01998472, + "epoch": 0.4871039499789575, + "flos": 15121409483520.0, + "grad_norm": 2.0933243825728116, + "language_loss": 0.79741073, + "learning_rate": 2.180475589073227e-06, + "loss": 0.8194803, + "num_input_tokens_seen": 87203620, + "step": 4051, + "time_per_iteration": 3.2058520317077637 + }, + { + "auxiliary_loss_clip": 0.0116048, + "auxiliary_loss_mlp": 0.01024475, + "balance_loss_clip": 1.05248952, + "balance_loss_mlp": 1.01639235, + "epoch": 0.48722419286959656, + "flos": 26174066808960.0, + "grad_norm": 1.59038909181476, + "language_loss": 0.73389429, + "learning_rate": 2.1796997817676456e-06, + "loss": 0.75574374, + "num_input_tokens_seen": 87224630, + "step": 4052, + "time_per_iteration": 2.5167155265808105 + }, + { + "auxiliary_loss_clip": 0.01172781, + "auxiliary_loss_mlp": 0.00762206, + "balance_loss_clip": 1.0552392, + "balance_loss_mlp": 1.00008011, + "epoch": 0.4873444357602357, + "flos": 24026142349440.0, + "grad_norm": 1.5389986628754269, + "language_loss": 0.67266291, + "learning_rate": 2.1789239472017494e-06, + "loss": 0.69201279, + "num_input_tokens_seen": 87246280, + "step": 4053, + "time_per_iteration": 2.5113210678100586 + }, + { + "auxiliary_loss_clip": 0.0114349, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.05210698, + "balance_loss_mlp": 1.02468824, + "epoch": 0.4874646786508748, + "flos": 22820441500800.0, + "grad_norm": 1.9762733860961847, + "language_loss": 0.7297858, + "learning_rate": 2.1781480854932326e-06, + "loss": 0.75155276, + "num_input_tokens_seen": 87266045, + "step": 4054, + "time_per_iteration": 2.5399374961853027 + }, + { + "auxiliary_loss_clip": 0.01126496, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.05019116, + "balance_loss_mlp": 1.02375579, + "epoch": 0.48758492154151384, + "flos": 21287594557440.0, + "grad_norm": 1.8853190002157212, + "language_loss": 0.79476541, + "learning_rate": 2.1773721967597933e-06, + "loss": 0.81634605, + "num_input_tokens_seen": 87284495, + "step": 4055, + "time_per_iteration": 2.597148895263672 + }, + { + "auxiliary_loss_clip": 0.01049061, + "auxiliary_loss_mlp": 0.01001406, + "balance_loss_clip": 1.01509893, + "balance_loss_mlp": 1.00019598, + "epoch": 0.48770516443215295, + "flos": 62244109180800.0, + "grad_norm": 0.8504286983227286, + "language_loss": 0.57372952, + "learning_rate": 2.1765962811191322e-06, + "loss": 0.59423423, + "num_input_tokens_seen": 87338960, + "step": 4056, + "time_per_iteration": 2.999025344848633 + }, + { + "auxiliary_loss_clip": 0.01037147, + "auxiliary_loss_mlp": 0.0100351, + "balance_loss_clip": 1.02121603, + "balance_loss_mlp": 1.00238371, + "epoch": 0.48782540732279206, + "flos": 66133451882880.0, + "grad_norm": 0.8236176291206496, + "language_loss": 0.62008166, + "learning_rate": 2.1758203386889566e-06, + "loss": 0.64048827, + "num_input_tokens_seen": 87401730, + "step": 4057, + "time_per_iteration": 3.179783821105957 + }, + { + "auxiliary_loss_clip": 0.01140356, + "auxiliary_loss_mlp": 0.00762801, + "balance_loss_clip": 1.04939401, + "balance_loss_mlp": 1.00007892, + "epoch": 0.4879456502134311, + "flos": 14607922608000.0, + "grad_norm": 2.185603282625709, + "language_loss": 0.84332621, + "learning_rate": 2.1750443695869746e-06, + "loss": 0.8623578, + "num_input_tokens_seen": 87417300, + "step": 4058, + "time_per_iteration": 2.5328805446624756 + }, + { + "auxiliary_loss_clip": 0.01172297, + "auxiliary_loss_mlp": 0.01029766, + "balance_loss_clip": 1.05412626, + "balance_loss_mlp": 1.02167809, + "epoch": 0.4880658931040702, + "flos": 19500464257920.0, + "grad_norm": 2.0124907170253756, + "language_loss": 0.85885882, + "learning_rate": 2.174268373930901e-06, + "loss": 0.88087946, + "num_input_tokens_seen": 87434815, + "step": 4059, + "time_per_iteration": 2.495298385620117 + }, + { + "auxiliary_loss_clip": 0.01137662, + "auxiliary_loss_mlp": 0.00763332, + "balance_loss_clip": 1.05353796, + "balance_loss_mlp": 1.00006366, + "epoch": 0.48818613599470934, + "flos": 16723060928640.0, + "grad_norm": 2.077818884417373, + "language_loss": 0.79981762, + "learning_rate": 2.1734923518384537e-06, + "loss": 0.81882757, + "num_input_tokens_seen": 87451420, + "step": 4060, + "time_per_iteration": 2.490832805633545 + }, + { + "auxiliary_loss_clip": 0.01127164, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.05108762, + "balance_loss_mlp": 1.02489567, + "epoch": 0.4883063788853484, + "flos": 26756932803840.0, + "grad_norm": 1.7850121293091334, + "language_loss": 0.82266873, + "learning_rate": 2.1727163034273547e-06, + "loss": 0.84426749, + "num_input_tokens_seen": 87469585, + "step": 4061, + "time_per_iteration": 2.587679624557495 + }, + { + "auxiliary_loss_clip": 0.01172929, + "auxiliary_loss_mlp": 0.01024303, + "balance_loss_clip": 1.05345392, + "balance_loss_mlp": 1.01585746, + "epoch": 0.4884266217759875, + "flos": 16763388923520.0, + "grad_norm": 2.4614690520949862, + "language_loss": 0.79285216, + "learning_rate": 2.17194022881533e-06, + "loss": 0.81482452, + "num_input_tokens_seen": 87485675, + "step": 4062, + "time_per_iteration": 2.4265897274017334 + }, + { + "auxiliary_loss_clip": 0.01158876, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.05140376, + "balance_loss_mlp": 1.02390909, + "epoch": 0.4885468646666266, + "flos": 24207132003840.0, + "grad_norm": 6.143432545327328, + "language_loss": 0.67689884, + "learning_rate": 2.1711641281201092e-06, + "loss": 0.69881183, + "num_input_tokens_seen": 87505605, + "step": 4063, + "time_per_iteration": 2.533766508102417 + }, + { + "auxiliary_loss_clip": 0.01168889, + "auxiliary_loss_mlp": 0.01026654, + "balance_loss_clip": 1.05443549, + "balance_loss_mlp": 1.01848829, + "epoch": 0.48866710755726567, + "flos": 14610795696000.0, + "grad_norm": 2.0452451851030835, + "language_loss": 0.79497457, + "learning_rate": 2.1703880014594264e-06, + "loss": 0.81693, + "num_input_tokens_seen": 87523195, + "step": 4064, + "time_per_iteration": 2.443293333053589 + }, + { + "auxiliary_loss_clip": 0.01124908, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.05432439, + "balance_loss_mlp": 1.02397454, + "epoch": 0.4887873504479048, + "flos": 28804451771520.0, + "grad_norm": 2.367919336367914, + "language_loss": 0.73787099, + "learning_rate": 2.1696118489510182e-06, + "loss": 0.75943494, + "num_input_tokens_seen": 87544125, + "step": 4065, + "time_per_iteration": 2.6290082931518555 + }, + { + "auxiliary_loss_clip": 0.01147638, + "auxiliary_loss_mlp": 0.00763423, + "balance_loss_clip": 1.0521009, + "balance_loss_mlp": 1.00004148, + "epoch": 0.48890759333854383, + "flos": 22784387224320.0, + "grad_norm": 1.7240674376886689, + "language_loss": 0.72531772, + "learning_rate": 2.1688356707126286e-06, + "loss": 0.74442828, + "num_input_tokens_seen": 87563745, + "step": 4066, + "time_per_iteration": 2.549661159515381 + }, + { + "auxiliary_loss_clip": 0.01137688, + "auxiliary_loss_mlp": 0.01028774, + "balance_loss_clip": 1.04970789, + "balance_loss_mlp": 1.02037561, + "epoch": 0.48902783622918294, + "flos": 17786088956160.0, + "grad_norm": 3.0225175160789726, + "language_loss": 0.70128107, + "learning_rate": 2.168059466862001e-06, + "loss": 0.72294563, + "num_input_tokens_seen": 87581895, + "step": 4067, + "time_per_iteration": 2.527517318725586 + }, + { + "auxiliary_loss_clip": 0.01155495, + "auxiliary_loss_mlp": 0.01027418, + "balance_loss_clip": 1.04899478, + "balance_loss_mlp": 1.0198009, + "epoch": 0.48914807911982205, + "flos": 22310294590080.0, + "grad_norm": 1.9855676841262162, + "language_loss": 0.8183217, + "learning_rate": 2.167283237516887e-06, + "loss": 0.84015083, + "num_input_tokens_seen": 87600170, + "step": 4068, + "time_per_iteration": 2.5178065299987793 + }, + { + "auxiliary_loss_clip": 0.01159729, + "auxiliary_loss_mlp": 0.01033758, + "balance_loss_clip": 1.05256808, + "balance_loss_mlp": 1.02556229, + "epoch": 0.4892683220104611, + "flos": 16363020954240.0, + "grad_norm": 1.7547126209766728, + "language_loss": 0.74762404, + "learning_rate": 2.1665069827950383e-06, + "loss": 0.76955891, + "num_input_tokens_seen": 87617455, + "step": 4069, + "time_per_iteration": 2.485710859298706 + }, + { + "auxiliary_loss_clip": 0.01157013, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_clip": 1.05280459, + "balance_loss_mlp": 1.01913369, + "epoch": 0.4893885649011002, + "flos": 15739144606080.0, + "grad_norm": 2.3863527753022313, + "language_loss": 0.86746252, + "learning_rate": 2.1657307028142126e-06, + "loss": 0.88930064, + "num_input_tokens_seen": 87634995, + "step": 4070, + "time_per_iteration": 3.367102861404419 + }, + { + "auxiliary_loss_clip": 0.01156453, + "auxiliary_loss_mlp": 0.01034344, + "balance_loss_clip": 1.05322647, + "balance_loss_mlp": 1.02535522, + "epoch": 0.48950880779173933, + "flos": 28581984887040.0, + "grad_norm": 3.149404413788562, + "language_loss": 0.67488945, + "learning_rate": 2.164954397692171e-06, + "loss": 0.69679743, + "num_input_tokens_seen": 87654420, + "step": 4071, + "time_per_iteration": 2.571049690246582 + }, + { + "auxiliary_loss_clip": 0.0106145, + "auxiliary_loss_mlp": 0.01003267, + "balance_loss_clip": 1.01723826, + "balance_loss_mlp": 1.00205135, + "epoch": 0.4896290506823784, + "flos": 66186310746240.0, + "grad_norm": 1.0803737716027464, + "language_loss": 0.77365255, + "learning_rate": 2.164178067546678e-06, + "loss": 0.79429972, + "num_input_tokens_seen": 87713585, + "step": 4072, + "time_per_iteration": 3.135568857192993 + }, + { + "auxiliary_loss_clip": 0.01159518, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.04995441, + "balance_loss_mlp": 1.02207875, + "epoch": 0.4897492935730175, + "flos": 12531065207040.0, + "grad_norm": 1.9436197279462866, + "language_loss": 0.90988368, + "learning_rate": 2.163401712495504e-06, + "loss": 0.93178332, + "num_input_tokens_seen": 87731280, + "step": 4073, + "time_per_iteration": 3.352949380874634 + }, + { + "auxiliary_loss_clip": 0.01135533, + "auxiliary_loss_mlp": 0.01036582, + "balance_loss_clip": 1.05201387, + "balance_loss_mlp": 1.02802885, + "epoch": 0.4898695364636566, + "flos": 23476816679040.0, + "grad_norm": 1.509612098462676, + "language_loss": 0.79023528, + "learning_rate": 2.1626253326564194e-06, + "loss": 0.81195647, + "num_input_tokens_seen": 87750230, + "step": 4074, + "time_per_iteration": 3.3941211700439453 + }, + { + "auxiliary_loss_clip": 0.01153414, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.04936397, + "balance_loss_mlp": 1.02222383, + "epoch": 0.48998977935429566, + "flos": 27160209774720.0, + "grad_norm": 3.837295918962431, + "language_loss": 0.76680166, + "learning_rate": 2.161848928147201e-06, + "loss": 0.78864217, + "num_input_tokens_seen": 87770500, + "step": 4075, + "time_per_iteration": 2.560128927230835 + }, + { + "auxiliary_loss_clip": 0.01171607, + "auxiliary_loss_mlp": 0.01026177, + "balance_loss_clip": 1.05549455, + "balance_loss_mlp": 1.01795769, + "epoch": 0.4901100222449348, + "flos": 20339588856960.0, + "grad_norm": 1.970392440756129, + "language_loss": 0.8081584, + "learning_rate": 2.161072499085629e-06, + "loss": 0.8301363, + "num_input_tokens_seen": 87789495, + "step": 4076, + "time_per_iteration": 2.478710651397705 + }, + { + "auxiliary_loss_clip": 0.01147646, + "auxiliary_loss_mlp": 0.01026849, + "balance_loss_clip": 1.05219328, + "balance_loss_mlp": 1.01894557, + "epoch": 0.4902302651355739, + "flos": 30446359384320.0, + "grad_norm": 1.8004402661580634, + "language_loss": 0.83366829, + "learning_rate": 2.160296045589487e-06, + "loss": 0.8554132, + "num_input_tokens_seen": 87812955, + "step": 4077, + "time_per_iteration": 2.6417996883392334 + }, + { + "auxiliary_loss_clip": 0.01168822, + "auxiliary_loss_mlp": 0.01024558, + "balance_loss_clip": 1.05364525, + "balance_loss_mlp": 1.01617169, + "epoch": 0.49035050802621294, + "flos": 19174180089600.0, + "grad_norm": 2.377398819649, + "language_loss": 0.69271755, + "learning_rate": 2.159519567776562e-06, + "loss": 0.71465135, + "num_input_tokens_seen": 87832605, + "step": 4078, + "time_per_iteration": 3.205803632736206 + }, + { + "auxiliary_loss_clip": 0.01129338, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.04453564, + "balance_loss_mlp": 1.01866043, + "epoch": 0.49047075091685205, + "flos": 22228489365120.0, + "grad_norm": 2.6030239595589744, + "language_loss": 0.70642829, + "learning_rate": 2.1587430657646463e-06, + "loss": 0.72799218, + "num_input_tokens_seen": 87846040, + "step": 4079, + "time_per_iteration": 2.547797679901123 + }, + { + "auxiliary_loss_clip": 0.01154888, + "auxiliary_loss_mlp": 0.01024846, + "balance_loss_clip": 1.05300307, + "balance_loss_mlp": 1.01694822, + "epoch": 0.4905909938074911, + "flos": 20156516213760.0, + "grad_norm": 1.9767102803922114, + "language_loss": 0.77868295, + "learning_rate": 2.157966539671533e-06, + "loss": 0.80048025, + "num_input_tokens_seen": 87865680, + "step": 4080, + "time_per_iteration": 2.5252723693847656 + }, + { + "auxiliary_loss_clip": 0.0114174, + "auxiliary_loss_mlp": 0.01025557, + "balance_loss_clip": 1.04926026, + "balance_loss_mlp": 1.01799011, + "epoch": 0.4907112366981302, + "flos": 17202217380480.0, + "grad_norm": 2.0091646715340072, + "language_loss": 0.6726476, + "learning_rate": 2.157189989615021e-06, + "loss": 0.69432056, + "num_input_tokens_seen": 87884270, + "step": 4081, + "time_per_iteration": 2.5656538009643555 + }, + { + "auxiliary_loss_clip": 0.01170981, + "auxiliary_loss_mlp": 0.00763178, + "balance_loss_clip": 1.0515815, + "balance_loss_mlp": 1.00011992, + "epoch": 0.4908314795887693, + "flos": 21688968107520.0, + "grad_norm": 2.211378102152233, + "language_loss": 0.75354463, + "learning_rate": 2.156413415712913e-06, + "loss": 0.77288622, + "num_input_tokens_seen": 87906320, + "step": 4082, + "time_per_iteration": 2.5558478832244873 + }, + { + "auxiliary_loss_clip": 0.01163424, + "auxiliary_loss_mlp": 0.00763654, + "balance_loss_clip": 1.05446339, + "balance_loss_mlp": 1.00014174, + "epoch": 0.4909517224794084, + "flos": 26213676531840.0, + "grad_norm": 1.6783046842293747, + "language_loss": 0.78429568, + "learning_rate": 2.155636818083014e-06, + "loss": 0.80356646, + "num_input_tokens_seen": 87927690, + "step": 4083, + "time_per_iteration": 2.538339853286743 + }, + { + "auxiliary_loss_clip": 0.0115417, + "auxiliary_loss_mlp": 0.01024173, + "balance_loss_clip": 1.05390143, + "balance_loss_mlp": 1.01689482, + "epoch": 0.4910719653700475, + "flos": 23148377694720.0, + "grad_norm": 1.8961320654349467, + "language_loss": 0.84105182, + "learning_rate": 2.154860196843134e-06, + "loss": 0.86283529, + "num_input_tokens_seen": 87946885, + "step": 4084, + "time_per_iteration": 2.513349771499634 + }, + { + "auxiliary_loss_clip": 0.0118527, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.05567265, + "balance_loss_mlp": 1.02063835, + "epoch": 0.4911922082606866, + "flos": 23331845387520.0, + "grad_norm": 1.6832848412475891, + "language_loss": 0.76682121, + "learning_rate": 2.154083552111085e-06, + "loss": 0.78895926, + "num_input_tokens_seen": 87966055, + "step": 4085, + "time_per_iteration": 2.453343152999878 + }, + { + "auxiliary_loss_clip": 0.01185093, + "auxiliary_loss_mlp": 0.01027101, + "balance_loss_clip": 1.05306721, + "balance_loss_mlp": 1.01885223, + "epoch": 0.49131245115132566, + "flos": 29203239542400.0, + "grad_norm": 1.943740418708593, + "language_loss": 0.81703532, + "learning_rate": 2.1533068840046834e-06, + "loss": 0.83915728, + "num_input_tokens_seen": 87986320, + "step": 4086, + "time_per_iteration": 2.488363742828369 + }, + { + "auxiliary_loss_clip": 0.01149195, + "auxiliary_loss_mlp": 0.00763245, + "balance_loss_clip": 1.05005085, + "balance_loss_mlp": 1.00013399, + "epoch": 0.49143269404196477, + "flos": 20147465986560.0, + "grad_norm": 2.328205497385505, + "language_loss": 0.61776161, + "learning_rate": 2.152530192641749e-06, + "loss": 0.636886, + "num_input_tokens_seen": 88001230, + "step": 4087, + "time_per_iteration": 2.4580159187316895 + }, + { + "auxiliary_loss_clip": 0.01172731, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.05240154, + "balance_loss_mlp": 1.0245502, + "epoch": 0.4915529369326039, + "flos": 24389809597440.0, + "grad_norm": 1.8741599549456955, + "language_loss": 0.72298652, + "learning_rate": 2.1517534781401068e-06, + "loss": 0.74504244, + "num_input_tokens_seen": 88019110, + "step": 4088, + "time_per_iteration": 2.491710901260376 + }, + { + "auxiliary_loss_clip": 0.01169037, + "auxiliary_loss_mlp": 0.01024429, + "balance_loss_clip": 1.0533483, + "balance_loss_mlp": 1.01623356, + "epoch": 0.49167317982324293, + "flos": 10524305197440.0, + "grad_norm": 2.5346007002992805, + "language_loss": 0.69590163, + "learning_rate": 2.150976740617581e-06, + "loss": 0.71783626, + "num_input_tokens_seen": 88035670, + "step": 4089, + "time_per_iteration": 2.45009708404541 + }, + { + "auxiliary_loss_clip": 0.01161298, + "auxiliary_loss_mlp": 0.01030254, + "balance_loss_clip": 1.05377293, + "balance_loss_mlp": 1.02201641, + "epoch": 0.49179342271388204, + "flos": 25593427457280.0, + "grad_norm": 1.8633110249632234, + "language_loss": 0.71191609, + "learning_rate": 2.150199980192006e-06, + "loss": 0.73383164, + "num_input_tokens_seen": 88054790, + "step": 4090, + "time_per_iteration": 2.5462887287139893 + }, + { + "auxiliary_loss_clip": 0.0114873, + "auxiliary_loss_mlp": 0.01024441, + "balance_loss_clip": 1.04979455, + "balance_loss_mlp": 1.0166862, + "epoch": 0.49191366560452116, + "flos": 21102043875840.0, + "grad_norm": 1.6342879001809376, + "language_loss": 0.80829811, + "learning_rate": 2.1494231969812114e-06, + "loss": 0.83002979, + "num_input_tokens_seen": 88073780, + "step": 4091, + "time_per_iteration": 2.471823215484619 + }, + { + "auxiliary_loss_clip": 0.01148388, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.05338955, + "balance_loss_mlp": 1.02001905, + "epoch": 0.4920339084951602, + "flos": 26067520091520.0, + "grad_norm": 2.4557446123295015, + "language_loss": 0.81295007, + "learning_rate": 2.1486463911030372e-06, + "loss": 0.83471942, + "num_input_tokens_seen": 88094430, + "step": 4092, + "time_per_iteration": 2.581411361694336 + }, + { + "auxiliary_loss_clip": 0.01151775, + "auxiliary_loss_mlp": 0.01031163, + "balance_loss_clip": 1.04779994, + "balance_loss_mlp": 1.02317047, + "epoch": 0.4921541513857993, + "flos": 25081269384960.0, + "grad_norm": 1.8172882285606748, + "language_loss": 0.74467409, + "learning_rate": 2.147869562675324e-06, + "loss": 0.76650345, + "num_input_tokens_seen": 88113400, + "step": 4093, + "time_per_iteration": 2.5447893142700195 + }, + { + "auxiliary_loss_clip": 0.01169473, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.05332804, + "balance_loss_mlp": 1.01990676, + "epoch": 0.49227439427643843, + "flos": 24389809597440.0, + "grad_norm": 1.7643488611528046, + "language_loss": 0.72301012, + "learning_rate": 2.147092711815915e-06, + "loss": 0.74498451, + "num_input_tokens_seen": 88132750, + "step": 4094, + "time_per_iteration": 2.4886093139648438 + }, + { + "auxiliary_loss_clip": 0.01141163, + "auxiliary_loss_mlp": 0.01022179, + "balance_loss_clip": 1.05231571, + "balance_loss_mlp": 1.01460588, + "epoch": 0.4923946371670775, + "flos": 11363753018880.0, + "grad_norm": 2.443281012314946, + "language_loss": 0.8587079, + "learning_rate": 2.1463158386426593e-06, + "loss": 0.88034129, + "num_input_tokens_seen": 88150560, + "step": 4095, + "time_per_iteration": 2.4847512245178223 + }, + { + "auxiliary_loss_clip": 0.01163391, + "auxiliary_loss_mlp": 0.01029957, + "balance_loss_clip": 1.05332351, + "balance_loss_mlp": 1.02148676, + "epoch": 0.4925148800577166, + "flos": 30445964334720.0, + "grad_norm": 1.9821149252294032, + "language_loss": 0.7748816, + "learning_rate": 2.145538943273407e-06, + "loss": 0.7968151, + "num_input_tokens_seen": 88170835, + "step": 4096, + "time_per_iteration": 2.5843424797058105 + }, + { + "auxiliary_loss_clip": 0.01186129, + "auxiliary_loss_mlp": 0.01030119, + "balance_loss_clip": 1.05723906, + "balance_loss_mlp": 1.02219796, + "epoch": 0.49263512294835565, + "flos": 20850454039680.0, + "grad_norm": 2.932051404567651, + "language_loss": 0.71612585, + "learning_rate": 2.144762025826013e-06, + "loss": 0.73828828, + "num_input_tokens_seen": 88189925, + "step": 4097, + "time_per_iteration": 3.275627374649048 + }, + { + "auxiliary_loss_clip": 0.01174187, + "auxiliary_loss_mlp": 0.01030341, + "balance_loss_clip": 1.05285192, + "balance_loss_mlp": 1.02218711, + "epoch": 0.49275536583899476, + "flos": 23767477534080.0, + "grad_norm": 2.255771670245462, + "language_loss": 0.86852419, + "learning_rate": 2.143985086418334e-06, + "loss": 0.89056945, + "num_input_tokens_seen": 88205105, + "step": 4098, + "time_per_iteration": 2.450673818588257 + }, + { + "auxiliary_loss_clip": 0.0115596, + "auxiliary_loss_mlp": 0.01022018, + "balance_loss_clip": 1.05104613, + "balance_loss_mlp": 1.01432848, + "epoch": 0.4928756087296339, + "flos": 22273522041600.0, + "grad_norm": 1.4335540950301706, + "language_loss": 0.76434034, + "learning_rate": 2.1432081251682324e-06, + "loss": 0.78612012, + "num_input_tokens_seen": 88225475, + "step": 4099, + "time_per_iteration": 3.343742847442627 + }, + { + "auxiliary_loss_clip": 0.01176135, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.06125724, + "balance_loss_mlp": 1.0181222, + "epoch": 0.49299585162027293, + "flos": 19645471463040.0, + "grad_norm": 2.216194006508457, + "language_loss": 0.87102968, + "learning_rate": 2.142431142193572e-06, + "loss": 0.89306086, + "num_input_tokens_seen": 88243255, + "step": 4100, + "time_per_iteration": 3.276048183441162 + }, + { + "auxiliary_loss_clip": 0.011841, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.0565002, + "balance_loss_mlp": 1.02131426, + "epoch": 0.49311609451091204, + "flos": 38837138497920.0, + "grad_norm": 2.299635026533799, + "language_loss": 0.7161544, + "learning_rate": 2.1416541376122207e-06, + "loss": 0.73829168, + "num_input_tokens_seen": 88263435, + "step": 4101, + "time_per_iteration": 2.579281806945801 + }, + { + "auxiliary_loss_clip": 0.01182973, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.0530324, + "balance_loss_mlp": 1.02055156, + "epoch": 0.49323633740155115, + "flos": 28329102161280.0, + "grad_norm": 1.7028832097159055, + "language_loss": 0.73146141, + "learning_rate": 2.1408771115420496e-06, + "loss": 0.75358522, + "num_input_tokens_seen": 88283295, + "step": 4102, + "time_per_iteration": 2.47206711769104 + }, + { + "auxiliary_loss_clip": 0.01130977, + "auxiliary_loss_mlp": 0.01029036, + "balance_loss_clip": 1.05464578, + "balance_loss_mlp": 1.02138281, + "epoch": 0.4933565802921902, + "flos": 21135584200320.0, + "grad_norm": 2.000212652467221, + "language_loss": 0.64809251, + "learning_rate": 2.140100064100932e-06, + "loss": 0.66969264, + "num_input_tokens_seen": 88299270, + "step": 4103, + "time_per_iteration": 2.577216386795044 + }, + { + "auxiliary_loss_clip": 0.01166449, + "auxiliary_loss_mlp": 0.01023223, + "balance_loss_clip": 1.05257654, + "balance_loss_mlp": 1.01554906, + "epoch": 0.4934768231828293, + "flos": 18039007595520.0, + "grad_norm": 1.821748772083299, + "language_loss": 0.75829285, + "learning_rate": 2.139322995406746e-06, + "loss": 0.78018957, + "num_input_tokens_seen": 88316905, + "step": 4104, + "time_per_iteration": 3.143646478652954 + }, + { + "auxiliary_loss_clip": 0.01187829, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.05776906, + "balance_loss_mlp": 1.02409816, + "epoch": 0.4935970660734684, + "flos": 23469957181440.0, + "grad_norm": 1.8561560074483614, + "language_loss": 0.79598165, + "learning_rate": 2.1385459055773727e-06, + "loss": 0.81818759, + "num_input_tokens_seen": 88335095, + "step": 4105, + "time_per_iteration": 2.4261133670806885 + }, + { + "auxiliary_loss_clip": 0.011128, + "auxiliary_loss_mlp": 0.00762647, + "balance_loss_clip": 1.04559731, + "balance_loss_mlp": 1.00013137, + "epoch": 0.4937173089641075, + "flos": 64479258840960.0, + "grad_norm": 2.1145639537237235, + "language_loss": 0.73897099, + "learning_rate": 2.137768794730696e-06, + "loss": 0.75772548, + "num_input_tokens_seen": 88358545, + "step": 4106, + "time_per_iteration": 2.948566436767578 + }, + { + "auxiliary_loss_clip": 0.01160698, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.0540297, + "balance_loss_mlp": 1.02558064, + "epoch": 0.4938375518547466, + "flos": 22346025644160.0, + "grad_norm": 1.7450999679347463, + "language_loss": 0.80337608, + "learning_rate": 2.1369916629846026e-06, + "loss": 0.82532454, + "num_input_tokens_seen": 88378295, + "step": 4107, + "time_per_iteration": 2.536890983581543 + }, + { + "auxiliary_loss_clip": 0.0115413, + "auxiliary_loss_mlp": 0.01024966, + "balance_loss_clip": 1.04833758, + "balance_loss_mlp": 1.01715827, + "epoch": 0.4939577947453857, + "flos": 17858700299520.0, + "grad_norm": 1.8202031863284833, + "language_loss": 0.75075233, + "learning_rate": 2.136214510456983e-06, + "loss": 0.77254331, + "num_input_tokens_seen": 88396750, + "step": 4108, + "time_per_iteration": 2.6008787155151367 + }, + { + "auxiliary_loss_clip": 0.01048722, + "auxiliary_loss_mlp": 0.00753078, + "balance_loss_clip": 1.02279603, + "balance_loss_mlp": 1.00006735, + "epoch": 0.49407803763602476, + "flos": 70066746875520.0, + "grad_norm": 0.8904927835101719, + "language_loss": 0.63189888, + "learning_rate": 2.1354373372657296e-06, + "loss": 0.64991689, + "num_input_tokens_seen": 88455190, + "step": 4109, + "time_per_iteration": 3.1587002277374268 + }, + { + "auxiliary_loss_clip": 0.01183679, + "auxiliary_loss_mlp": 0.01028884, + "balance_loss_clip": 1.05637908, + "balance_loss_mlp": 1.02154064, + "epoch": 0.49419828052666387, + "flos": 24317485562880.0, + "grad_norm": 1.9446445796968632, + "language_loss": 0.71029574, + "learning_rate": 2.1346601435287404e-06, + "loss": 0.7324214, + "num_input_tokens_seen": 88477460, + "step": 4110, + "time_per_iteration": 2.5125043392181396 + }, + { + "auxiliary_loss_clip": 0.0115357, + "auxiliary_loss_mlp": 0.01026553, + "balance_loss_clip": 1.04996765, + "balance_loss_mlp": 1.01864958, + "epoch": 0.494318523417303, + "flos": 29386060790400.0, + "grad_norm": 2.0056009138406687, + "language_loss": 0.80229628, + "learning_rate": 2.1338829293639144e-06, + "loss": 0.82409751, + "num_input_tokens_seen": 88497820, + "step": 4111, + "time_per_iteration": 2.5772926807403564 + }, + { + "auxiliary_loss_clip": 0.01127871, + "auxiliary_loss_mlp": 0.01033919, + "balance_loss_clip": 1.05036068, + "balance_loss_mlp": 1.02529407, + "epoch": 0.49443876630794203, + "flos": 15268284195840.0, + "grad_norm": 1.9464909524123395, + "language_loss": 0.82966781, + "learning_rate": 2.1331056948891547e-06, + "loss": 0.8512857, + "num_input_tokens_seen": 88514920, + "step": 4112, + "time_per_iteration": 2.5548248291015625 + }, + { + "auxiliary_loss_clip": 0.01151468, + "auxiliary_loss_mlp": 0.01026444, + "balance_loss_clip": 1.05139184, + "balance_loss_mlp": 1.01839781, + "epoch": 0.49455900919858115, + "flos": 12347453859840.0, + "grad_norm": 2.153547646896936, + "language_loss": 0.76733214, + "learning_rate": 2.1323284402223666e-06, + "loss": 0.78911126, + "num_input_tokens_seen": 88530910, + "step": 4113, + "time_per_iteration": 2.4448986053466797 + }, + { + "auxiliary_loss_clip": 0.01182954, + "auxiliary_loss_mlp": 0.00761664, + "balance_loss_clip": 1.05851507, + "balance_loss_mlp": 1.00008357, + "epoch": 0.4946792520892202, + "flos": 22779610715520.0, + "grad_norm": 1.894227028815412, + "language_loss": 0.88390982, + "learning_rate": 2.1315511654814597e-06, + "loss": 0.90335602, + "num_input_tokens_seen": 88549320, + "step": 4114, + "time_per_iteration": 2.449101209640503 + }, + { + "auxiliary_loss_clip": 0.0114949, + "auxiliary_loss_mlp": 0.01024801, + "balance_loss_clip": 1.05316114, + "balance_loss_mlp": 1.01755619, + "epoch": 0.4947994949798593, + "flos": 23148126299520.0, + "grad_norm": 1.8064484786574229, + "language_loss": 0.78082669, + "learning_rate": 2.1307738707843456e-06, + "loss": 0.80256957, + "num_input_tokens_seen": 88568985, + "step": 4115, + "time_per_iteration": 2.5029611587524414 + }, + { + "auxiliary_loss_clip": 0.01175774, + "auxiliary_loss_mlp": 0.01026601, + "balance_loss_clip": 1.05641603, + "balance_loss_mlp": 1.01887, + "epoch": 0.4949197378704984, + "flos": 23659997063040.0, + "grad_norm": 1.9267952837255575, + "language_loss": 0.68972141, + "learning_rate": 2.1299965562489385e-06, + "loss": 0.71174514, + "num_input_tokens_seen": 88588790, + "step": 4116, + "time_per_iteration": 2.5004801750183105 + }, + { + "auxiliary_loss_clip": 0.01167111, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.05162835, + "balance_loss_mlp": 1.02229583, + "epoch": 0.4950399807611375, + "flos": 26911493026560.0, + "grad_norm": 1.404968374885798, + "language_loss": 0.79080629, + "learning_rate": 2.129219221993158e-06, + "loss": 0.81278175, + "num_input_tokens_seen": 88613575, + "step": 4117, + "time_per_iteration": 2.54606032371521 + }, + { + "auxiliary_loss_clip": 0.01057268, + "auxiliary_loss_mlp": 0.01006629, + "balance_loss_clip": 1.02741957, + "balance_loss_mlp": 1.00551462, + "epoch": 0.4951602236517766, + "flos": 67315270187520.0, + "grad_norm": 0.7990890443105685, + "language_loss": 0.59967893, + "learning_rate": 2.128441868134924e-06, + "loss": 0.62031788, + "num_input_tokens_seen": 88675510, + "step": 4118, + "time_per_iteration": 3.1416573524475098 + }, + { + "auxiliary_loss_clip": 0.01142471, + "auxiliary_loss_mlp": 0.01027405, + "balance_loss_clip": 1.04829574, + "balance_loss_mlp": 1.01922154, + "epoch": 0.4952804665424157, + "flos": 19901442758400.0, + "grad_norm": 2.0912139166573, + "language_loss": 0.82745439, + "learning_rate": 2.1276644947921606e-06, + "loss": 0.84915316, + "num_input_tokens_seen": 88694425, + "step": 4119, + "time_per_iteration": 2.5093836784362793 + }, + { + "auxiliary_loss_clip": 0.0116913, + "auxiliary_loss_mlp": 0.01026062, + "balance_loss_clip": 1.05257046, + "balance_loss_mlp": 1.01715124, + "epoch": 0.49540070943305475, + "flos": 18806813740800.0, + "grad_norm": 1.9239056445013087, + "language_loss": 0.8263731, + "learning_rate": 2.126887102082795e-06, + "loss": 0.84832501, + "num_input_tokens_seen": 88714450, + "step": 4120, + "time_per_iteration": 2.4562063217163086 + }, + { + "auxiliary_loss_clip": 0.0113999, + "auxiliary_loss_mlp": 0.0102735, + "balance_loss_clip": 1.04816628, + "balance_loss_mlp": 1.01959276, + "epoch": 0.49552095232369386, + "flos": 24934179191040.0, + "grad_norm": 1.9105544846739133, + "language_loss": 0.70732427, + "learning_rate": 2.126109690124757e-06, + "loss": 0.72899771, + "num_input_tokens_seen": 88735265, + "step": 4121, + "time_per_iteration": 2.5715651512145996 + }, + { + "auxiliary_loss_clip": 0.01126929, + "auxiliary_loss_mlp": 0.01027542, + "balance_loss_clip": 1.04761755, + "balance_loss_mlp": 1.01981711, + "epoch": 0.495641195214333, + "flos": 22857249962880.0, + "grad_norm": 1.6611266414500356, + "language_loss": 0.71172798, + "learning_rate": 2.1253322590359786e-06, + "loss": 0.73327267, + "num_input_tokens_seen": 88754600, + "step": 4122, + "time_per_iteration": 2.574528455734253 + }, + { + "auxiliary_loss_clip": 0.01166389, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.05131733, + "balance_loss_mlp": 1.02505851, + "epoch": 0.49576143810497203, + "flos": 25769748343680.0, + "grad_norm": 1.7271996843557909, + "language_loss": 0.73734379, + "learning_rate": 2.124554808934397e-06, + "loss": 0.75933522, + "num_input_tokens_seen": 88775180, + "step": 4123, + "time_per_iteration": 2.5132992267608643 + }, + { + "auxiliary_loss_clip": 0.01119559, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.04577088, + "balance_loss_mlp": 1.02005935, + "epoch": 0.49588168099561114, + "flos": 22128838058880.0, + "grad_norm": 1.8237209943990293, + "language_loss": 0.72868013, + "learning_rate": 2.1237773399379496e-06, + "loss": 0.75016212, + "num_input_tokens_seen": 88796145, + "step": 4124, + "time_per_iteration": 3.4027979373931885 + }, + { + "auxiliary_loss_clip": 0.01157245, + "auxiliary_loss_mlp": 0.01025202, + "balance_loss_clip": 1.04608119, + "balance_loss_mlp": 1.01640153, + "epoch": 0.49600192388625025, + "flos": 24387331559040.0, + "grad_norm": 9.384741885532225, + "language_loss": 0.8693943, + "learning_rate": 2.122999852164578e-06, + "loss": 0.89121872, + "num_input_tokens_seen": 88816765, + "step": 4125, + "time_per_iteration": 2.534491539001465 + }, + { + "auxiliary_loss_clip": 0.01123971, + "auxiliary_loss_mlp": 0.01024233, + "balance_loss_clip": 1.04789424, + "balance_loss_mlp": 1.01600778, + "epoch": 0.4961221667768893, + "flos": 22857429530880.0, + "grad_norm": 2.3943681056355444, + "language_loss": 0.58202177, + "learning_rate": 2.122222345732227e-06, + "loss": 0.60350382, + "num_input_tokens_seen": 88836680, + "step": 4126, + "time_per_iteration": 3.4016165733337402 + }, + { + "auxiliary_loss_clip": 0.01141351, + "auxiliary_loss_mlp": 0.01027726, + "balance_loss_clip": 1.048908, + "balance_loss_mlp": 1.01952422, + "epoch": 0.4962424096675284, + "flos": 17858089768320.0, + "grad_norm": 2.678943307408686, + "language_loss": 0.83160228, + "learning_rate": 2.121444820758843e-06, + "loss": 0.85329306, + "num_input_tokens_seen": 88855320, + "step": 4127, + "time_per_iteration": 3.260526180267334 + }, + { + "auxiliary_loss_clip": 0.01125578, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.04996705, + "balance_loss_mlp": 1.02537668, + "epoch": 0.49636265255816747, + "flos": 21793611404160.0, + "grad_norm": 2.081068333847806, + "language_loss": 0.78742272, + "learning_rate": 2.120667277362376e-06, + "loss": 0.80901992, + "num_input_tokens_seen": 88874035, + "step": 4128, + "time_per_iteration": 2.5433716773986816 + }, + { + "auxiliary_loss_clip": 0.0118723, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.05742288, + "balance_loss_mlp": 1.02771091, + "epoch": 0.4964828954488066, + "flos": 16358603581440.0, + "grad_norm": 2.201434720448348, + "language_loss": 0.85090858, + "learning_rate": 2.1198897156607796e-06, + "loss": 0.87314433, + "num_input_tokens_seen": 88891390, + "step": 4129, + "time_per_iteration": 2.4152348041534424 + }, + { + "auxiliary_loss_clip": 0.01173403, + "auxiliary_loss_mlp": 0.01029267, + "balance_loss_clip": 1.05186415, + "balance_loss_mlp": 1.02113104, + "epoch": 0.4966031383394457, + "flos": 24711101775360.0, + "grad_norm": 2.9455213774524034, + "language_loss": 0.73813653, + "learning_rate": 2.1191121357720085e-06, + "loss": 0.76016319, + "num_input_tokens_seen": 88909450, + "step": 4130, + "time_per_iteration": 3.2572743892669678 + }, + { + "auxiliary_loss_clip": 0.01120327, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.04868948, + "balance_loss_mlp": 1.02118862, + "epoch": 0.49672338123008475, + "flos": 22930615491840.0, + "grad_norm": 1.6450711028103961, + "language_loss": 0.7504428, + "learning_rate": 2.1183345378140206e-06, + "loss": 0.77194291, + "num_input_tokens_seen": 88929195, + "step": 4131, + "time_per_iteration": 2.6694369316101074 + }, + { + "auxiliary_loss_clip": 0.01072872, + "auxiliary_loss_mlp": 0.01002259, + "balance_loss_clip": 1.01930702, + "balance_loss_mlp": 1.0008105, + "epoch": 0.49684362412072386, + "flos": 65976736844160.0, + "grad_norm": 0.8539052808092114, + "language_loss": 0.61984456, + "learning_rate": 2.1175569219047783e-06, + "loss": 0.64059579, + "num_input_tokens_seen": 88990635, + "step": 4132, + "time_per_iteration": 3.160383701324463 + }, + { + "auxiliary_loss_clip": 0.0118328, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.05444455, + "balance_loss_mlp": 1.01999497, + "epoch": 0.49696386701136297, + "flos": 19971288754560.0, + "grad_norm": 1.6719445703853393, + "language_loss": 0.73208475, + "learning_rate": 2.1167792881622437e-06, + "loss": 0.75419515, + "num_input_tokens_seen": 89009655, + "step": 4133, + "time_per_iteration": 2.4385061264038086 + }, + { + "auxiliary_loss_clip": 0.01153971, + "auxiliary_loss_mlp": 0.01034489, + "balance_loss_clip": 1.0534575, + "balance_loss_mlp": 1.02666879, + "epoch": 0.497084109902002, + "flos": 24750819239040.0, + "grad_norm": 1.7693898649399644, + "language_loss": 0.81001109, + "learning_rate": 2.116001636704384e-06, + "loss": 0.83189565, + "num_input_tokens_seen": 89030040, + "step": 4134, + "time_per_iteration": 2.5290114879608154 + }, + { + "auxiliary_loss_clip": 0.01138425, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.05056131, + "balance_loss_mlp": 1.02447033, + "epoch": 0.49720435279264114, + "flos": 21871825269120.0, + "grad_norm": 1.8235047463180554, + "language_loss": 0.80288315, + "learning_rate": 2.1152239676491685e-06, + "loss": 0.82459593, + "num_input_tokens_seen": 89048145, + "step": 4135, + "time_per_iteration": 2.5922958850860596 + }, + { + "auxiliary_loss_clip": 0.01159992, + "auxiliary_loss_mlp": 0.01025529, + "balance_loss_clip": 1.05013418, + "balance_loss_mlp": 1.0177151, + "epoch": 0.49732459568328025, + "flos": 23805794367360.0, + "grad_norm": 1.7542176573297985, + "language_loss": 0.73184544, + "learning_rate": 2.114446281114569e-06, + "loss": 0.75370061, + "num_input_tokens_seen": 89067165, + "step": 4136, + "time_per_iteration": 2.5260047912597656 + }, + { + "auxiliary_loss_clip": 0.01147251, + "auxiliary_loss_mlp": 0.01026575, + "balance_loss_clip": 1.05117249, + "balance_loss_mlp": 1.01786077, + "epoch": 0.4974448385739193, + "flos": 20047742853120.0, + "grad_norm": 1.9369203888230022, + "language_loss": 0.76220119, + "learning_rate": 2.1136685772185587e-06, + "loss": 0.78393948, + "num_input_tokens_seen": 89086190, + "step": 4137, + "time_per_iteration": 2.489786386489868 + }, + { + "auxiliary_loss_clip": 0.01152505, + "auxiliary_loss_mlp": 0.00763387, + "balance_loss_clip": 1.04498053, + "balance_loss_mlp": 1.0000608, + "epoch": 0.4975650814645584, + "flos": 24821347593600.0, + "grad_norm": 1.600728129345119, + "language_loss": 0.78087401, + "learning_rate": 2.1128908560791163e-06, + "loss": 0.80003291, + "num_input_tokens_seen": 89106020, + "step": 4138, + "time_per_iteration": 2.541379690170288 + }, + { + "auxiliary_loss_clip": 0.01184245, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.05584574, + "balance_loss_mlp": 1.02143252, + "epoch": 0.4976853243551975, + "flos": 19829477859840.0, + "grad_norm": 2.1474548819042285, + "language_loss": 0.78336895, + "learning_rate": 2.1121131178142203e-06, + "loss": 0.80550939, + "num_input_tokens_seen": 89125385, + "step": 4139, + "time_per_iteration": 2.4247288703918457 + }, + { + "auxiliary_loss_clip": 0.01156934, + "auxiliary_loss_mlp": 0.01025583, + "balance_loss_clip": 1.05096173, + "balance_loss_mlp": 1.01781678, + "epoch": 0.4978055672458366, + "flos": 23142990654720.0, + "grad_norm": 1.5999293712498972, + "language_loss": 0.82329381, + "learning_rate": 2.1113353625418544e-06, + "loss": 0.845119, + "num_input_tokens_seen": 89143935, + "step": 4140, + "time_per_iteration": 2.505744457244873 + }, + { + "auxiliary_loss_clip": 0.01162455, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.05359507, + "balance_loss_mlp": 1.02156019, + "epoch": 0.4979258101364757, + "flos": 15559914718080.0, + "grad_norm": 1.754613911575738, + "language_loss": 0.79090488, + "learning_rate": 2.1105575903800017e-06, + "loss": 0.81281954, + "num_input_tokens_seen": 89162655, + "step": 4141, + "time_per_iteration": 2.427483081817627 + }, + { + "auxiliary_loss_clip": 0.01173402, + "auxiliary_loss_mlp": 0.01026574, + "balance_loss_clip": 1.05192101, + "balance_loss_mlp": 1.01840806, + "epoch": 0.4980460530271148, + "flos": 26356169784960.0, + "grad_norm": 1.8894631223960248, + "language_loss": 0.85268092, + "learning_rate": 2.1097798014466502e-06, + "loss": 0.8746807, + "num_input_tokens_seen": 89182255, + "step": 4142, + "time_per_iteration": 2.484276533126831 + }, + { + "auxiliary_loss_clip": 0.01173964, + "auxiliary_loss_mlp": 0.01028953, + "balance_loss_clip": 1.05452919, + "balance_loss_mlp": 1.01988733, + "epoch": 0.49816629591775385, + "flos": 17274541415040.0, + "grad_norm": 2.074310481810634, + "language_loss": 0.5847829, + "learning_rate": 2.109001995859791e-06, + "loss": 0.606812, + "num_input_tokens_seen": 89201155, + "step": 4143, + "time_per_iteration": 2.4370336532592773 + }, + { + "auxiliary_loss_clip": 0.01061754, + "auxiliary_loss_mlp": 0.01003484, + "balance_loss_clip": 1.02219915, + "balance_loss_mlp": 1.0021069, + "epoch": 0.49828653880839296, + "flos": 64930947344640.0, + "grad_norm": 0.8104183401912067, + "language_loss": 0.60123158, + "learning_rate": 2.108224173737415e-06, + "loss": 0.62188399, + "num_input_tokens_seen": 89264455, + "step": 4144, + "time_per_iteration": 3.064518451690674 + }, + { + "auxiliary_loss_clip": 0.01150527, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.04804778, + "balance_loss_mlp": 1.02130914, + "epoch": 0.498406781699032, + "flos": 27484806003840.0, + "grad_norm": 3.0373829126736895, + "language_loss": 0.76386493, + "learning_rate": 2.1074463351975183e-06, + "loss": 0.78567308, + "num_input_tokens_seen": 89283340, + "step": 4145, + "time_per_iteration": 2.5412256717681885 + }, + { + "auxiliary_loss_clip": 0.01144585, + "auxiliary_loss_mlp": 0.0102251, + "balance_loss_clip": 1.04964185, + "balance_loss_mlp": 1.0148474, + "epoch": 0.49852702458967113, + "flos": 31499870307840.0, + "grad_norm": 1.6063259693606657, + "language_loss": 0.71589231, + "learning_rate": 2.106668480358098e-06, + "loss": 0.73756331, + "num_input_tokens_seen": 89303565, + "step": 4146, + "time_per_iteration": 2.6004159450531006 + }, + { + "auxiliary_loss_clip": 0.01150463, + "auxiliary_loss_mlp": 0.01026896, + "balance_loss_clip": 1.04723525, + "balance_loss_mlp": 1.01814604, + "epoch": 0.49864726748031024, + "flos": 22852868503680.0, + "grad_norm": 1.567746630007518, + "language_loss": 0.70865643, + "learning_rate": 2.105890609337154e-06, + "loss": 0.73043001, + "num_input_tokens_seen": 89322080, + "step": 4147, + "time_per_iteration": 2.5667285919189453 + }, + { + "auxiliary_loss_clip": 0.01083031, + "auxiliary_loss_mlp": 0.01001448, + "balance_loss_clip": 1.02080953, + "balance_loss_mlp": 1.00014901, + "epoch": 0.4987675103709493, + "flos": 70405708544640.0, + "grad_norm": 0.6878898786990562, + "language_loss": 0.63853502, + "learning_rate": 2.1051127222526883e-06, + "loss": 0.65937972, + "num_input_tokens_seen": 89394195, + "step": 4148, + "time_per_iteration": 3.1408419609069824 + }, + { + "auxiliary_loss_clip": 0.01168398, + "auxiliary_loss_mlp": 0.01023651, + "balance_loss_clip": 1.05565691, + "balance_loss_mlp": 1.01578307, + "epoch": 0.4988877532615884, + "flos": 28767571482240.0, + "grad_norm": 1.5412398137947618, + "language_loss": 0.8084116, + "learning_rate": 2.1043348192227067e-06, + "loss": 0.83033204, + "num_input_tokens_seen": 89414565, + "step": 4149, + "time_per_iteration": 2.5256338119506836 + }, + { + "auxiliary_loss_clip": 0.01129651, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.04869795, + "balance_loss_mlp": 1.02454877, + "epoch": 0.4990079961522275, + "flos": 16872700988160.0, + "grad_norm": 1.6977619615921062, + "language_loss": 0.61629057, + "learning_rate": 2.1035569003652156e-06, + "loss": 0.63791585, + "num_input_tokens_seen": 89433195, + "step": 4150, + "time_per_iteration": 3.3442342281341553 + }, + { + "auxiliary_loss_clip": 0.01123107, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.04685307, + "balance_loss_mlp": 1.02780664, + "epoch": 0.4991282390428666, + "flos": 13291042187520.0, + "grad_norm": 1.9188090522970949, + "language_loss": 0.81876165, + "learning_rate": 2.1027789657982255e-06, + "loss": 0.84036946, + "num_input_tokens_seen": 89447410, + "step": 4151, + "time_per_iteration": 2.5223541259765625 + }, + { + "auxiliary_loss_clip": 0.01127162, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.04864275, + "balance_loss_mlp": 1.02450156, + "epoch": 0.4992484819335057, + "flos": 21537496454400.0, + "grad_norm": 1.9915568983225955, + "language_loss": 0.7703954, + "learning_rate": 2.1020010156397482e-06, + "loss": 0.79199147, + "num_input_tokens_seen": 89464630, + "step": 4152, + "time_per_iteration": 2.558551788330078 + }, + { + "auxiliary_loss_clip": 0.01169351, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.05259895, + "balance_loss_mlp": 1.02545202, + "epoch": 0.4993687248241448, + "flos": 24860095390080.0, + "grad_norm": 1.4786463022889875, + "language_loss": 0.7766124, + "learning_rate": 2.101223050007797e-06, + "loss": 0.79864079, + "num_input_tokens_seen": 89483180, + "step": 4153, + "time_per_iteration": 3.3580849170684814 + }, + { + "auxiliary_loss_clip": 0.0108163, + "auxiliary_loss_mlp": 0.01001306, + "balance_loss_clip": 1.01904166, + "balance_loss_mlp": 0.99999505, + "epoch": 0.49948896771478385, + "flos": 62941602453120.0, + "grad_norm": 0.8207227722631733, + "language_loss": 0.53765684, + "learning_rate": 2.1004450690203904e-06, + "loss": 0.55848616, + "num_input_tokens_seen": 89539260, + "step": 4154, + "time_per_iteration": 3.8548424243927 + }, + { + "auxiliary_loss_clip": 0.01081326, + "auxiliary_loss_mlp": 0.01001137, + "balance_loss_clip": 1.01902056, + "balance_loss_mlp": 0.99988502, + "epoch": 0.49960921060542296, + "flos": 68284213516800.0, + "grad_norm": 0.8536652669784582, + "language_loss": 0.63358331, + "learning_rate": 2.099667072795546e-06, + "loss": 0.65440792, + "num_input_tokens_seen": 89601380, + "step": 4155, + "time_per_iteration": 3.084392547607422 + }, + { + "auxiliary_loss_clip": 0.01166483, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.05074334, + "balance_loss_mlp": 1.02206814, + "epoch": 0.49972945349606207, + "flos": 23659350618240.0, + "grad_norm": 2.3055516959853253, + "language_loss": 0.79888976, + "learning_rate": 2.0988890614512864e-06, + "loss": 0.82086068, + "num_input_tokens_seen": 89621270, + "step": 4156, + "time_per_iteration": 2.5225977897644043 + }, + { + "auxiliary_loss_clip": 0.01159267, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.05502033, + "balance_loss_mlp": 1.02136457, + "epoch": 0.4998496963867011, + "flos": 19755825022080.0, + "grad_norm": 1.6344915540691656, + "language_loss": 0.84120947, + "learning_rate": 2.098111035105635e-06, + "loss": 0.86309636, + "num_input_tokens_seen": 89639695, + "step": 4157, + "time_per_iteration": 3.2427964210510254 + }, + { + "auxiliary_loss_clip": 0.01126301, + "auxiliary_loss_mlp": 0.01029097, + "balance_loss_clip": 1.05217063, + "balance_loss_mlp": 1.02112246, + "epoch": 0.49996993927734024, + "flos": 22265728790400.0, + "grad_norm": 1.7160573327998834, + "language_loss": 0.73278666, + "learning_rate": 2.0973329938766176e-06, + "loss": 0.75434065, + "num_input_tokens_seen": 89657125, + "step": 4158, + "time_per_iteration": 2.596588134765625 + }, + { + "auxiliary_loss_clip": 0.01165641, + "auxiliary_loss_mlp": 0.01033458, + "balance_loss_clip": 1.05397642, + "balance_loss_mlp": 1.02487493, + "epoch": 0.5000901821679793, + "flos": 23327212533120.0, + "grad_norm": 1.9087083097330289, + "language_loss": 0.78850073, + "learning_rate": 2.0965549378822618e-06, + "loss": 0.81049168, + "num_input_tokens_seen": 89678415, + "step": 4159, + "time_per_iteration": 2.580095052719116 + }, + { + "auxiliary_loss_clip": 0.01091393, + "auxiliary_loss_mlp": 0.01026239, + "balance_loss_clip": 1.04507875, + "balance_loss_mlp": 1.01816249, + "epoch": 0.5002104250586185, + "flos": 20339014239360.0, + "grad_norm": 1.9060454826206714, + "language_loss": 0.83911598, + "learning_rate": 2.095776867240599e-06, + "loss": 0.8602922, + "num_input_tokens_seen": 89695405, + "step": 4160, + "time_per_iteration": 2.6216256618499756 + }, + { + "auxiliary_loss_clip": 0.01133653, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.0473839, + "balance_loss_mlp": 1.02227426, + "epoch": 0.5003306679492575, + "flos": 13991372634240.0, + "grad_norm": 1.845389233110712, + "language_loss": 0.82811105, + "learning_rate": 2.094998782069661e-06, + "loss": 0.84974384, + "num_input_tokens_seen": 89713110, + "step": 4161, + "time_per_iteration": 2.5136258602142334 + }, + { + "auxiliary_loss_clip": 0.01183639, + "auxiliary_loss_mlp": 0.01027165, + "balance_loss_clip": 1.05594206, + "balance_loss_mlp": 1.01898742, + "epoch": 0.5004509108398966, + "flos": 27672762896640.0, + "grad_norm": 1.7248011203696096, + "language_loss": 0.75443238, + "learning_rate": 2.0942206824874845e-06, + "loss": 0.77654046, + "num_input_tokens_seen": 89735885, + "step": 4162, + "time_per_iteration": 2.490328550338745 + }, + { + "auxiliary_loss_clip": 0.01170035, + "auxiliary_loss_mlp": 0.01027891, + "balance_loss_clip": 1.05594885, + "balance_loss_mlp": 1.0193975, + "epoch": 0.5005711537305357, + "flos": 14976186796800.0, + "grad_norm": 2.1625795840130104, + "language_loss": 0.79006231, + "learning_rate": 2.093442568612105e-06, + "loss": 0.81204152, + "num_input_tokens_seen": 89753690, + "step": 4163, + "time_per_iteration": 2.4314751625061035 + }, + { + "auxiliary_loss_clip": 0.01182263, + "auxiliary_loss_mlp": 0.01025346, + "balance_loss_clip": 1.05275989, + "balance_loss_mlp": 1.0171802, + "epoch": 0.5006913966211748, + "flos": 26503259978880.0, + "grad_norm": 1.4703431591048903, + "language_loss": 0.85319805, + "learning_rate": 2.0926644405615613e-06, + "loss": 0.87527406, + "num_input_tokens_seen": 89774590, + "step": 4164, + "time_per_iteration": 2.4616031646728516 + }, + { + "auxiliary_loss_clip": 0.01133387, + "auxiliary_loss_mlp": 0.01028397, + "balance_loss_clip": 1.04901135, + "balance_loss_mlp": 1.02048099, + "epoch": 0.5008116395118138, + "flos": 20449295971200.0, + "grad_norm": 1.8117700571286253, + "language_loss": 0.81258881, + "learning_rate": 2.091886298453897e-06, + "loss": 0.83420658, + "num_input_tokens_seen": 89792775, + "step": 4165, + "time_per_iteration": 2.5090725421905518 + }, + { + "auxiliary_loss_clip": 0.01166732, + "auxiliary_loss_mlp": 0.01024058, + "balance_loss_clip": 1.05182672, + "balance_loss_mlp": 1.01641965, + "epoch": 0.500931882402453, + "flos": 21579871524480.0, + "grad_norm": 2.1610045685585133, + "language_loss": 0.73066109, + "learning_rate": 2.091108142407153e-06, + "loss": 0.75256902, + "num_input_tokens_seen": 89811515, + "step": 4166, + "time_per_iteration": 2.4603617191314697 + }, + { + "auxiliary_loss_clip": 0.01058418, + "auxiliary_loss_mlp": 0.01003957, + "balance_loss_clip": 1.029953, + "balance_loss_mlp": 1.00227606, + "epoch": 0.5010521252930921, + "flos": 57785011925760.0, + "grad_norm": 0.8383630926993236, + "language_loss": 0.62405729, + "learning_rate": 2.090329972539377e-06, + "loss": 0.64468098, + "num_input_tokens_seen": 89870080, + "step": 4167, + "time_per_iteration": 3.124333143234253 + }, + { + "auxiliary_loss_clip": 0.01092424, + "auxiliary_loss_mlp": 0.01030355, + "balance_loss_clip": 1.04495597, + "balance_loss_mlp": 1.02255905, + "epoch": 0.5011723681837311, + "flos": 18625500864000.0, + "grad_norm": 1.7024647686693317, + "language_loss": 0.68343723, + "learning_rate": 2.089551788968616e-06, + "loss": 0.70466501, + "num_input_tokens_seen": 89888045, + "step": 4168, + "time_per_iteration": 2.5765953063964844 + }, + { + "auxiliary_loss_clip": 0.0107902, + "auxiliary_loss_mlp": 0.01003842, + "balance_loss_clip": 1.01689529, + "balance_loss_mlp": 1.00264382, + "epoch": 0.5012926110743702, + "flos": 55883146608000.0, + "grad_norm": 0.834716188739814, + "language_loss": 0.60811985, + "learning_rate": 2.08877359181292e-06, + "loss": 0.62894845, + "num_input_tokens_seen": 89944610, + "step": 4169, + "time_per_iteration": 2.985989809036255 + }, + { + "auxiliary_loss_clip": 0.01142338, + "auxiliary_loss_mlp": 0.01026952, + "balance_loss_clip": 1.04551339, + "balance_loss_mlp": 1.01925445, + "epoch": 0.5014128539650093, + "flos": 24238266117120.0, + "grad_norm": 2.4386148547423683, + "language_loss": 0.85715073, + "learning_rate": 2.0879953811903396e-06, + "loss": 0.87884367, + "num_input_tokens_seen": 89959495, + "step": 4170, + "time_per_iteration": 2.5322375297546387 + }, + { + "auxiliary_loss_clip": 0.01167103, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.0531826, + "balance_loss_mlp": 1.02312195, + "epoch": 0.5015330968556484, + "flos": 27527468382720.0, + "grad_norm": 1.7939722933105977, + "language_loss": 0.7837528, + "learning_rate": 2.08721715721893e-06, + "loss": 0.80573595, + "num_input_tokens_seen": 89978820, + "step": 4171, + "time_per_iteration": 2.5158846378326416 + }, + { + "auxiliary_loss_clip": 0.01167839, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.05332685, + "balance_loss_mlp": 1.0225141, + "epoch": 0.5016533397462875, + "flos": 23800802376960.0, + "grad_norm": 1.747416038381673, + "language_loss": 0.76668239, + "learning_rate": 2.0864389200167477e-06, + "loss": 0.78866565, + "num_input_tokens_seen": 89997075, + "step": 4172, + "time_per_iteration": 2.47259521484375 + }, + { + "auxiliary_loss_clip": 0.01170802, + "auxiliary_loss_mlp": 0.00762675, + "balance_loss_clip": 1.05285501, + "balance_loss_mlp": 1.00014329, + "epoch": 0.5017735826369266, + "flos": 25295009264640.0, + "grad_norm": 1.919798091280079, + "language_loss": 0.78957605, + "learning_rate": 2.0856606697018504e-06, + "loss": 0.80891085, + "num_input_tokens_seen": 90015085, + "step": 4173, + "time_per_iteration": 2.4933574199676514 + }, + { + "auxiliary_loss_clip": 0.01149911, + "auxiliary_loss_mlp": 0.01029961, + "balance_loss_clip": 1.04977274, + "balance_loss_mlp": 1.02165246, + "epoch": 0.5018938255275657, + "flos": 16873203778560.0, + "grad_norm": 2.8500797342025574, + "language_loss": 0.72968054, + "learning_rate": 2.084882406392297e-06, + "loss": 0.75147927, + "num_input_tokens_seen": 90033045, + "step": 4174, + "time_per_iteration": 2.478384256362915 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01027174, + "balance_loss_clip": 1.05123162, + "balance_loss_mlp": 1.01927614, + "epoch": 0.5020140684182047, + "flos": 25515429073920.0, + "grad_norm": 2.494079964511907, + "language_loss": 0.70868623, + "learning_rate": 2.0841041302061496e-06, + "loss": 0.7304399, + "num_input_tokens_seen": 90052505, + "step": 4175, + "time_per_iteration": 2.5475215911865234 + }, + { + "auxiliary_loss_clip": 0.01142428, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.04672444, + "balance_loss_mlp": 1.02592075, + "epoch": 0.5021343113088439, + "flos": 23659278791040.0, + "grad_norm": 1.9479204045875718, + "language_loss": 0.75687671, + "learning_rate": 2.083325841261473e-06, + "loss": 0.77864051, + "num_input_tokens_seen": 90071565, + "step": 4176, + "time_per_iteration": 2.512023687362671 + }, + { + "auxiliary_loss_clip": 0.01145399, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.04672837, + "balance_loss_mlp": 1.01819158, + "epoch": 0.502254554199483, + "flos": 24534673148160.0, + "grad_norm": 2.429520816426436, + "language_loss": 0.66624248, + "learning_rate": 2.0825475396763322e-06, + "loss": 0.68795645, + "num_input_tokens_seen": 90092215, + "step": 4177, + "time_per_iteration": 3.3415985107421875 + }, + { + "auxiliary_loss_clip": 0.01098364, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.04720092, + "balance_loss_mlp": 1.02111149, + "epoch": 0.502374797090122, + "flos": 34240285607040.0, + "grad_norm": 1.3806383394855086, + "language_loss": 0.65732622, + "learning_rate": 2.081769225568796e-06, + "loss": 0.67860615, + "num_input_tokens_seen": 90114665, + "step": 4178, + "time_per_iteration": 2.730598211288452 + }, + { + "auxiliary_loss_clip": 0.01168322, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.04987943, + "balance_loss_mlp": 1.02526331, + "epoch": 0.5024950399807612, + "flos": 26031106679040.0, + "grad_norm": 1.489354769754311, + "language_loss": 0.75870955, + "learning_rate": 2.0809908990569327e-06, + "loss": 0.78073239, + "num_input_tokens_seen": 90136445, + "step": 4179, + "time_per_iteration": 3.362760543823242 + }, + { + "auxiliary_loss_clip": 0.01153864, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.05078077, + "balance_loss_mlp": 1.02071762, + "epoch": 0.5026152828714002, + "flos": 21252438120960.0, + "grad_norm": 1.6577920262841377, + "language_loss": 0.78895926, + "learning_rate": 2.0802125602588146e-06, + "loss": 0.81078947, + "num_input_tokens_seen": 90155710, + "step": 4180, + "time_per_iteration": 3.326996088027954 + }, + { + "auxiliary_loss_clip": 0.0118223, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.05484164, + "balance_loss_mlp": 1.02550054, + "epoch": 0.5027355257620393, + "flos": 30956111245440.0, + "grad_norm": 1.8507698758535343, + "language_loss": 0.66382253, + "learning_rate": 2.0794342092925146e-06, + "loss": 0.68598163, + "num_input_tokens_seen": 90176845, + "step": 4181, + "time_per_iteration": 2.507242441177368 + }, + { + "auxiliary_loss_clip": 0.01172468, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.05574775, + "balance_loss_mlp": 1.02333903, + "epoch": 0.5028557686526784, + "flos": 24791147233920.0, + "grad_norm": 1.928388142887104, + "language_loss": 0.68008, + "learning_rate": 2.078655846276108e-06, + "loss": 0.70211977, + "num_input_tokens_seen": 90197175, + "step": 4182, + "time_per_iteration": 2.4964892864227295 + }, + { + "auxiliary_loss_clip": 0.01148994, + "auxiliary_loss_mlp": 0.01028131, + "balance_loss_clip": 1.05040455, + "balance_loss_mlp": 1.01983976, + "epoch": 0.5029760115433175, + "flos": 22966992990720.0, + "grad_norm": 2.137462344561256, + "language_loss": 0.68710399, + "learning_rate": 2.0778774713276727e-06, + "loss": 0.70887524, + "num_input_tokens_seen": 90216650, + "step": 4183, + "time_per_iteration": 3.2987024784088135 + }, + { + "auxiliary_loss_clip": 0.01164778, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.04963803, + "balance_loss_mlp": 1.02106857, + "epoch": 0.5030962544339566, + "flos": 15305164485120.0, + "grad_norm": 2.1077179033705704, + "language_loss": 0.68073726, + "learning_rate": 2.077099084565287e-06, + "loss": 0.70268095, + "num_input_tokens_seen": 90234055, + "step": 4184, + "time_per_iteration": 2.4242732524871826 + }, + { + "auxiliary_loss_clip": 0.01147657, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.0478301, + "balance_loss_mlp": 1.02064419, + "epoch": 0.5032164973245957, + "flos": 24494847943680.0, + "grad_norm": 2.3040838560828556, + "language_loss": 0.65544063, + "learning_rate": 2.0763206861070313e-06, + "loss": 0.67720449, + "num_input_tokens_seen": 90253115, + "step": 4185, + "time_per_iteration": 2.5097901821136475 + }, + { + "auxiliary_loss_clip": 0.01185076, + "auxiliary_loss_mlp": 0.01032256, + "balance_loss_clip": 1.05550051, + "balance_loss_mlp": 1.02372706, + "epoch": 0.5033367402152348, + "flos": 16213452721920.0, + "grad_norm": 2.0072480334691063, + "language_loss": 0.75391686, + "learning_rate": 2.0755422760709876e-06, + "loss": 0.77609015, + "num_input_tokens_seen": 90270515, + "step": 4186, + "time_per_iteration": 2.399305820465088 + }, + { + "auxiliary_loss_clip": 0.01119322, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.04549003, + "balance_loss_mlp": 1.024845, + "epoch": 0.5034569831058738, + "flos": 21391375927680.0, + "grad_norm": 1.9119030243796675, + "language_loss": 0.76888931, + "learning_rate": 2.0747638545752417e-06, + "loss": 0.79041493, + "num_input_tokens_seen": 90289075, + "step": 4187, + "time_per_iteration": 2.5306806564331055 + }, + { + "auxiliary_loss_clip": 0.01154907, + "auxiliary_loss_mlp": 0.01026302, + "balance_loss_clip": 1.05417061, + "balance_loss_mlp": 1.01818991, + "epoch": 0.503577225996513, + "flos": 20558751690240.0, + "grad_norm": 2.0261212518530884, + "language_loss": 0.83200598, + "learning_rate": 2.073985421737878e-06, + "loss": 0.85381806, + "num_input_tokens_seen": 90306385, + "step": 4188, + "time_per_iteration": 2.4842543601989746 + }, + { + "auxiliary_loss_clip": 0.01171887, + "auxiliary_loss_mlp": 0.01026604, + "balance_loss_clip": 1.05360103, + "balance_loss_mlp": 1.01855719, + "epoch": 0.5036974688871521, + "flos": 27229157930880.0, + "grad_norm": 2.016543792409451, + "language_loss": 0.74444741, + "learning_rate": 2.0732069776769844e-06, + "loss": 0.76643234, + "num_input_tokens_seen": 90323795, + "step": 4189, + "time_per_iteration": 2.517836570739746 + }, + { + "auxiliary_loss_clip": 0.01184411, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.05654585, + "balance_loss_mlp": 1.02113903, + "epoch": 0.5038177117777911, + "flos": 20412164286720.0, + "grad_norm": 2.0890636437834433, + "language_loss": 0.73028249, + "learning_rate": 2.072428522510651e-06, + "loss": 0.75242567, + "num_input_tokens_seen": 90340360, + "step": 4190, + "time_per_iteration": 2.4159305095672607 + }, + { + "auxiliary_loss_clip": 0.01133694, + "auxiliary_loss_mlp": 0.01028127, + "balance_loss_clip": 1.04952073, + "balance_loss_mlp": 1.02043819, + "epoch": 0.5039379546684303, + "flos": 21907987286400.0, + "grad_norm": 2.1990393357000757, + "language_loss": 0.76394939, + "learning_rate": 2.071650056356968e-06, + "loss": 0.78556764, + "num_input_tokens_seen": 90357900, + "step": 4191, + "time_per_iteration": 2.5060040950775146 + }, + { + "auxiliary_loss_clip": 0.01181761, + "auxiliary_loss_mlp": 0.01033053, + "balance_loss_clip": 1.05405402, + "balance_loss_mlp": 1.02518511, + "epoch": 0.5040581975590693, + "flos": 20010718909440.0, + "grad_norm": 1.872558425835631, + "language_loss": 0.79949474, + "learning_rate": 2.070871579334028e-06, + "loss": 0.82164288, + "num_input_tokens_seen": 90377010, + "step": 4192, + "time_per_iteration": 2.428006887435913 + }, + { + "auxiliary_loss_clip": 0.01180101, + "auxiliary_loss_mlp": 0.01027685, + "balance_loss_clip": 1.05277407, + "balance_loss_mlp": 1.01966202, + "epoch": 0.5041784404497084, + "flos": 20959837931520.0, + "grad_norm": 3.869194947901325, + "language_loss": 0.71947128, + "learning_rate": 2.0700930915599264e-06, + "loss": 0.74154913, + "num_input_tokens_seen": 90396740, + "step": 4193, + "time_per_iteration": 2.4468986988067627 + }, + { + "auxiliary_loss_clip": 0.01182096, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.05380321, + "balance_loss_mlp": 1.02240825, + "epoch": 0.5042986833403476, + "flos": 12495082757760.0, + "grad_norm": 1.9991825328208133, + "language_loss": 0.78426707, + "learning_rate": 2.0693145931527583e-06, + "loss": 0.80639184, + "num_input_tokens_seen": 90413220, + "step": 4194, + "time_per_iteration": 2.3959057331085205 + }, + { + "auxiliary_loss_clip": 0.01148845, + "auxiliary_loss_mlp": 0.01031137, + "balance_loss_clip": 1.0500226, + "balance_loss_mlp": 1.0230068, + "epoch": 0.5044189262309866, + "flos": 29202305788800.0, + "grad_norm": 1.5362573874502132, + "language_loss": 0.7786479, + "learning_rate": 2.068536084230622e-06, + "loss": 0.8004477, + "num_input_tokens_seen": 90435085, + "step": 4195, + "time_per_iteration": 2.545450210571289 + }, + { + "auxiliary_loss_clip": 0.01168461, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.05380082, + "balance_loss_mlp": 1.02468359, + "epoch": 0.5045391691216257, + "flos": 23873198238720.0, + "grad_norm": 1.9581666163537965, + "language_loss": 0.88449043, + "learning_rate": 2.067757564911616e-06, + "loss": 0.90651625, + "num_input_tokens_seen": 90453660, + "step": 4196, + "time_per_iteration": 2.4743988513946533 + }, + { + "auxiliary_loss_clip": 0.01160993, + "auxiliary_loss_mlp": 0.00763363, + "balance_loss_clip": 1.05141652, + "balance_loss_mlp": 1.00016475, + "epoch": 0.5046594120122648, + "flos": 24644990793600.0, + "grad_norm": 3.0464101752092607, + "language_loss": 0.92837322, + "learning_rate": 2.0669790353138407e-06, + "loss": 0.9476167, + "num_input_tokens_seen": 90472625, + "step": 4197, + "time_per_iteration": 2.523627281188965 + }, + { + "auxiliary_loss_clip": 0.01136121, + "auxiliary_loss_mlp": 0.00763012, + "balance_loss_clip": 1.05122519, + "balance_loss_mlp": 1.00025356, + "epoch": 0.5047796549029039, + "flos": 23362835846400.0, + "grad_norm": 2.06997513878877, + "language_loss": 0.73087668, + "learning_rate": 2.0662004955553995e-06, + "loss": 0.74986798, + "num_input_tokens_seen": 90492325, + "step": 4198, + "time_per_iteration": 2.530907154083252 + }, + { + "auxiliary_loss_clip": 0.01148733, + "auxiliary_loss_mlp": 0.01023642, + "balance_loss_clip": 1.04856849, + "balance_loss_mlp": 1.01592374, + "epoch": 0.5048998977935429, + "flos": 17304095329920.0, + "grad_norm": 1.9088304713640585, + "language_loss": 0.76845455, + "learning_rate": 2.065421945754395e-06, + "loss": 0.7901783, + "num_input_tokens_seen": 90510055, + "step": 4199, + "time_per_iteration": 2.4702606201171875 + }, + { + "auxiliary_loss_clip": 0.01127468, + "auxiliary_loss_mlp": 0.01027449, + "balance_loss_clip": 1.05005312, + "balance_loss_mlp": 1.01991773, + "epoch": 0.505020140684182, + "flos": 34856979235200.0, + "grad_norm": 1.592849459414632, + "language_loss": 0.78183734, + "learning_rate": 2.0646433860289344e-06, + "loss": 0.80338645, + "num_input_tokens_seen": 90528980, + "step": 4200, + "time_per_iteration": 2.6616246700286865 + }, + { + "auxiliary_loss_clip": 0.01172984, + "auxiliary_loss_mlp": 0.00763655, + "balance_loss_clip": 1.05288565, + "balance_loss_mlp": 1.00022566, + "epoch": 0.5051403835748212, + "flos": 24863974058880.0, + "grad_norm": 2.2449877647319285, + "language_loss": 0.82668477, + "learning_rate": 2.0638648164971233e-06, + "loss": 0.84605116, + "num_input_tokens_seen": 90547445, + "step": 4201, + "time_per_iteration": 2.486485242843628 + }, + { + "auxiliary_loss_clip": 0.01152563, + "auxiliary_loss_mlp": 0.0102952, + "balance_loss_clip": 1.05217338, + "balance_loss_mlp": 1.02192688, + "epoch": 0.5052606264654602, + "flos": 20959694277120.0, + "grad_norm": 1.8321090215994578, + "language_loss": 0.88836199, + "learning_rate": 2.06308623727707e-06, + "loss": 0.91018283, + "num_input_tokens_seen": 90567545, + "step": 4202, + "time_per_iteration": 2.488079071044922 + }, + { + "auxiliary_loss_clip": 0.01162169, + "auxiliary_loss_mlp": 0.01027565, + "balance_loss_clip": 1.0506134, + "balance_loss_mlp": 1.01898205, + "epoch": 0.5053808693560993, + "flos": 19642382893440.0, + "grad_norm": 2.260185856820954, + "language_loss": 0.76533687, + "learning_rate": 2.0623076484868846e-06, + "loss": 0.78723413, + "num_input_tokens_seen": 90585000, + "step": 4203, + "time_per_iteration": 3.2754476070404053 + }, + { + "auxiliary_loss_clip": 0.01057443, + "auxiliary_loss_mlp": 0.01004305, + "balance_loss_clip": 1.02130198, + "balance_loss_mlp": 1.00317848, + "epoch": 0.5055011122467384, + "flos": 67504915019520.0, + "grad_norm": 0.8469472341861977, + "language_loss": 0.60696357, + "learning_rate": 2.061529050244679e-06, + "loss": 0.627581, + "num_input_tokens_seen": 90644745, + "step": 4204, + "time_per_iteration": 3.034193515777588 + }, + { + "auxiliary_loss_clip": 0.0114668, + "auxiliary_loss_mlp": 0.01022801, + "balance_loss_clip": 1.05068588, + "balance_loss_mlp": 1.01427186, + "epoch": 0.5056213551373775, + "flos": 16872952383360.0, + "grad_norm": 2.2513044406990224, + "language_loss": 0.74369597, + "learning_rate": 2.060750442668565e-06, + "loss": 0.76539075, + "num_input_tokens_seen": 90662500, + "step": 4205, + "time_per_iteration": 2.5891470909118652 + }, + { + "auxiliary_loss_clip": 0.01169881, + "auxiliary_loss_mlp": 0.01030287, + "balance_loss_clip": 1.05556941, + "balance_loss_mlp": 1.02191257, + "epoch": 0.5057415980280165, + "flos": 15334179696000.0, + "grad_norm": 2.49031724293243, + "language_loss": 0.64106423, + "learning_rate": 2.059971825876657e-06, + "loss": 0.66306591, + "num_input_tokens_seen": 90677010, + "step": 4206, + "time_per_iteration": 2.414470672607422 + }, + { + "auxiliary_loss_clip": 0.01170493, + "auxiliary_loss_mlp": 0.01026891, + "balance_loss_clip": 1.05423498, + "balance_loss_mlp": 1.01861501, + "epoch": 0.5058618409186557, + "flos": 19025976574080.0, + "grad_norm": 1.9791528682941184, + "language_loss": 0.7663753, + "learning_rate": 2.0591931999870713e-06, + "loss": 0.78834915, + "num_input_tokens_seen": 90695935, + "step": 4207, + "time_per_iteration": 4.025703191757202 + }, + { + "auxiliary_loss_clip": 0.01065987, + "auxiliary_loss_mlp": 0.01001798, + "balance_loss_clip": 1.0204407, + "balance_loss_mlp": 1.00072539, + "epoch": 0.5059820838092948, + "flos": 63453114080640.0, + "grad_norm": 0.8289903686563834, + "language_loss": 0.57591641, + "learning_rate": 2.0584145651179234e-06, + "loss": 0.59659427, + "num_input_tokens_seen": 90751645, + "step": 4208, + "time_per_iteration": 3.039763927459717 + }, + { + "auxiliary_loss_clip": 0.01155303, + "auxiliary_loss_mlp": 0.00762425, + "balance_loss_clip": 1.05463147, + "balance_loss_mlp": 1.00016546, + "epoch": 0.5061023266999338, + "flos": 15441803821440.0, + "grad_norm": 2.5622089982981513, + "language_loss": 0.80391634, + "learning_rate": 2.0576359213873327e-06, + "loss": 0.82309365, + "num_input_tokens_seen": 90766795, + "step": 4209, + "time_per_iteration": 2.4564402103424072 + }, + { + "auxiliary_loss_clip": 0.01160104, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.04809022, + "balance_loss_mlp": 1.02016318, + "epoch": 0.506222569590573, + "flos": 22451063990400.0, + "grad_norm": 4.534793872075395, + "language_loss": 0.70908451, + "learning_rate": 2.056857268913419e-06, + "loss": 0.73097014, + "num_input_tokens_seen": 90786845, + "step": 4210, + "time_per_iteration": 3.2950143814086914 + }, + { + "auxiliary_loss_clip": 0.01169104, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.05562043, + "balance_loss_mlp": 1.0197382, + "epoch": 0.506342812481212, + "flos": 17558665994880.0, + "grad_norm": 2.105991022313912, + "language_loss": 0.83813787, + "learning_rate": 2.056078607814303e-06, + "loss": 0.86010408, + "num_input_tokens_seen": 90802630, + "step": 4211, + "time_per_iteration": 2.411654233932495 + }, + { + "auxiliary_loss_clip": 0.01168489, + "auxiliary_loss_mlp": 0.01023847, + "balance_loss_clip": 1.05369461, + "balance_loss_mlp": 1.01562178, + "epoch": 0.5064630553718511, + "flos": 23402050519680.0, + "grad_norm": 3.9351652022734696, + "language_loss": 0.78508145, + "learning_rate": 2.055299938208106e-06, + "loss": 0.80700481, + "num_input_tokens_seen": 90823620, + "step": 4212, + "time_per_iteration": 2.4986555576324463 + }, + { + "auxiliary_loss_clip": 0.01174589, + "auxiliary_loss_mlp": 0.01031573, + "balance_loss_clip": 1.05630445, + "balance_loss_mlp": 1.02319312, + "epoch": 0.5065832982624903, + "flos": 23987035416960.0, + "grad_norm": 1.5746707234394322, + "language_loss": 0.86405993, + "learning_rate": 2.0545212602129526e-06, + "loss": 0.88612157, + "num_input_tokens_seen": 90843475, + "step": 4213, + "time_per_iteration": 2.480971574783325 + }, + { + "auxiliary_loss_clip": 0.01143534, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.04665661, + "balance_loss_mlp": 1.02359986, + "epoch": 0.5067035411531293, + "flos": 21503058289920.0, + "grad_norm": 2.0019384238611893, + "language_loss": 0.66305214, + "learning_rate": 2.0537425739469673e-06, + "loss": 0.68481135, + "num_input_tokens_seen": 90862410, + "step": 4214, + "time_per_iteration": 2.48228120803833 + }, + { + "auxiliary_loss_clip": 0.01074146, + "auxiliary_loss_mlp": 0.01002184, + "balance_loss_clip": 1.02130651, + "balance_loss_mlp": 1.00107503, + "epoch": 0.5068237840437684, + "flos": 65934397687680.0, + "grad_norm": 0.8419990211827253, + "language_loss": 0.59471959, + "learning_rate": 2.052963879528276e-06, + "loss": 0.61548293, + "num_input_tokens_seen": 90922280, + "step": 4215, + "time_per_iteration": 3.0256857872009277 + }, + { + "auxiliary_loss_clip": 0.01169888, + "auxiliary_loss_mlp": 0.0102373, + "balance_loss_clip": 1.05455613, + "balance_loss_mlp": 1.0157249, + "epoch": 0.5069440269344075, + "flos": 27264206626560.0, + "grad_norm": 2.0111282731624147, + "language_loss": 0.76598889, + "learning_rate": 2.052185177075007e-06, + "loss": 0.787925, + "num_input_tokens_seen": 90941850, + "step": 4216, + "time_per_iteration": 2.5299932956695557 + }, + { + "auxiliary_loss_clip": 0.0117132, + "auxiliary_loss_mlp": 0.01031496, + "balance_loss_clip": 1.05420351, + "balance_loss_mlp": 1.02381301, + "epoch": 0.5070642698250466, + "flos": 23366319465600.0, + "grad_norm": 1.6919714070192025, + "language_loss": 0.8306703, + "learning_rate": 2.051406466705288e-06, + "loss": 0.85269845, + "num_input_tokens_seen": 90961390, + "step": 4217, + "time_per_iteration": 2.4793636798858643 + }, + { + "auxiliary_loss_clip": 0.01180918, + "auxiliary_loss_mlp": 0.01027469, + "balance_loss_clip": 1.05301011, + "balance_loss_mlp": 1.01961899, + "epoch": 0.5071845127156857, + "flos": 20340127560960.0, + "grad_norm": 1.829978519391727, + "language_loss": 0.81053251, + "learning_rate": 2.0506277485372486e-06, + "loss": 0.83261639, + "num_input_tokens_seen": 90980215, + "step": 4218, + "time_per_iteration": 2.4224464893341064 + }, + { + "auxiliary_loss_clip": 0.01164191, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.05398405, + "balance_loss_mlp": 1.02469254, + "epoch": 0.5073047556063248, + "flos": 12092955022080.0, + "grad_norm": 1.8871532714754766, + "language_loss": 0.67037475, + "learning_rate": 2.04984902268902e-06, + "loss": 0.69234252, + "num_input_tokens_seen": 90997415, + "step": 4219, + "time_per_iteration": 2.4345171451568604 + }, + { + "auxiliary_loss_clip": 0.01172974, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.05083847, + "balance_loss_mlp": 1.02015352, + "epoch": 0.5074249984969639, + "flos": 19682854542720.0, + "grad_norm": 2.601554207508566, + "language_loss": 0.75460339, + "learning_rate": 2.0490702892787345e-06, + "loss": 0.77662605, + "num_input_tokens_seen": 91016475, + "step": 4220, + "time_per_iteration": 2.4609060287475586 + }, + { + "auxiliary_loss_clip": 0.01160354, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.05006528, + "balance_loss_mlp": 1.02389121, + "epoch": 0.5075452413876029, + "flos": 28765703975040.0, + "grad_norm": 2.192762152780997, + "language_loss": 0.62366754, + "learning_rate": 2.0482915484245246e-06, + "loss": 0.64559019, + "num_input_tokens_seen": 91038095, + "step": 4221, + "time_per_iteration": 2.505283832550049 + }, + { + "auxiliary_loss_clip": 0.01117755, + "auxiliary_loss_mlp": 0.01031162, + "balance_loss_clip": 1.04777181, + "balance_loss_mlp": 1.02243543, + "epoch": 0.5076654842782421, + "flos": 20339445202560.0, + "grad_norm": 2.369725711986272, + "language_loss": 0.84143519, + "learning_rate": 2.047512800244526e-06, + "loss": 0.86292434, + "num_input_tokens_seen": 91053360, + "step": 4222, + "time_per_iteration": 2.531452178955078 + }, + { + "auxiliary_loss_clip": 0.01167282, + "auxiliary_loss_mlp": 0.01025243, + "balance_loss_clip": 1.05313158, + "balance_loss_mlp": 1.01694572, + "epoch": 0.5077857271688812, + "flos": 26359653404160.0, + "grad_norm": 3.7636683653919003, + "language_loss": 0.78961837, + "learning_rate": 2.046734044856873e-06, + "loss": 0.81154358, + "num_input_tokens_seen": 91072770, + "step": 4223, + "time_per_iteration": 2.49385142326355 + }, + { + "auxiliary_loss_clip": 0.01167541, + "auxiliary_loss_mlp": 0.01029025, + "balance_loss_clip": 1.05351055, + "balance_loss_mlp": 1.02124691, + "epoch": 0.5079059700595202, + "flos": 21798962530560.0, + "grad_norm": 1.897913170136811, + "language_loss": 0.81250632, + "learning_rate": 2.045955282379702e-06, + "loss": 0.83447206, + "num_input_tokens_seen": 91091430, + "step": 4224, + "time_per_iteration": 2.4464786052703857 + }, + { + "auxiliary_loss_clip": 0.01164505, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.04941392, + "balance_loss_mlp": 1.02194023, + "epoch": 0.5080262129501594, + "flos": 13187943175680.0, + "grad_norm": 2.6944983267968388, + "language_loss": 0.76109529, + "learning_rate": 2.045176512931152e-06, + "loss": 0.78304422, + "num_input_tokens_seen": 91106060, + "step": 4225, + "time_per_iteration": 2.4173262119293213 + }, + { + "auxiliary_loss_clip": 0.01143044, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.04974222, + "balance_loss_mlp": 1.01703537, + "epoch": 0.5081464558407984, + "flos": 25301473712640.0, + "grad_norm": 2.337237949388475, + "language_loss": 0.76053709, + "learning_rate": 2.0443977366293604e-06, + "loss": 0.78221506, + "num_input_tokens_seen": 91124100, + "step": 4226, + "time_per_iteration": 2.541816473007202 + }, + { + "auxiliary_loss_clip": 0.01111162, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.04539144, + "balance_loss_mlp": 1.02336955, + "epoch": 0.5082666987314375, + "flos": 30951226995840.0, + "grad_norm": 1.6492625686937972, + "language_loss": 0.76927078, + "learning_rate": 2.043618953592468e-06, + "loss": 0.79070437, + "num_input_tokens_seen": 91146555, + "step": 4227, + "time_per_iteration": 2.661424160003662 + }, + { + "auxiliary_loss_clip": 0.01155349, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.05310082, + "balance_loss_mlp": 1.02155852, + "epoch": 0.5083869416220766, + "flos": 19682495406720.0, + "grad_norm": 1.5038381746403708, + "language_loss": 0.81146085, + "learning_rate": 2.0428401639386144e-06, + "loss": 0.83331573, + "num_input_tokens_seen": 91167120, + "step": 4228, + "time_per_iteration": 2.524024724960327 + }, + { + "auxiliary_loss_clip": 0.0105483, + "auxiliary_loss_mlp": 0.01003571, + "balance_loss_clip": 1.02039552, + "balance_loss_mlp": 1.00253987, + "epoch": 0.5085071845127157, + "flos": 71817535589760.0, + "grad_norm": 0.8183674946183168, + "language_loss": 0.58113456, + "learning_rate": 2.042061367785943e-06, + "loss": 0.60171854, + "num_input_tokens_seen": 91220260, + "step": 4229, + "time_per_iteration": 3.0499186515808105 + }, + { + "auxiliary_loss_clip": 0.01143538, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.04867339, + "balance_loss_mlp": 1.022493, + "epoch": 0.5086274274033548, + "flos": 35951608252800.0, + "grad_norm": 2.068272594458992, + "language_loss": 0.74709278, + "learning_rate": 2.041282565252594e-06, + "loss": 0.76883602, + "num_input_tokens_seen": 91240425, + "step": 4230, + "time_per_iteration": 3.4399161338806152 + }, + { + "auxiliary_loss_clip": 0.01140722, + "auxiliary_loss_mlp": 0.01027647, + "balance_loss_clip": 1.04955876, + "balance_loss_mlp": 1.01965976, + "epoch": 0.5087476702939938, + "flos": 23513732881920.0, + "grad_norm": 1.9248725173440195, + "language_loss": 0.77212548, + "learning_rate": 2.040503756456714e-06, + "loss": 0.79380912, + "num_input_tokens_seen": 91259635, + "step": 4231, + "time_per_iteration": 2.5535709857940674 + }, + { + "auxiliary_loss_clip": 0.01160407, + "auxiliary_loss_mlp": 0.0103112, + "balance_loss_clip": 1.04937959, + "balance_loss_mlp": 1.02263844, + "epoch": 0.508867913184633, + "flos": 15122091841920.0, + "grad_norm": 1.8720259744479968, + "language_loss": 0.78788388, + "learning_rate": 2.0397249415164456e-06, + "loss": 0.80979919, + "num_input_tokens_seen": 91276990, + "step": 4232, + "time_per_iteration": 2.434131622314453 + }, + { + "auxiliary_loss_clip": 0.01145754, + "auxiliary_loss_mlp": 0.01027276, + "balance_loss_clip": 1.04710853, + "balance_loss_mlp": 1.01895523, + "epoch": 0.508988156075272, + "flos": 25885309374720.0, + "grad_norm": 1.7991617762802905, + "language_loss": 0.80021429, + "learning_rate": 2.0389461205499354e-06, + "loss": 0.82194459, + "num_input_tokens_seen": 91296125, + "step": 4233, + "time_per_iteration": 3.3895139694213867 + }, + { + "auxiliary_loss_clip": 0.01141641, + "auxiliary_loss_mlp": 0.01029267, + "balance_loss_clip": 1.04964304, + "balance_loss_mlp": 1.02150595, + "epoch": 0.5091083989659111, + "flos": 13844857057920.0, + "grad_norm": 2.59532224720696, + "language_loss": 0.73436964, + "learning_rate": 2.03816729367533e-06, + "loss": 0.75607872, + "num_input_tokens_seen": 91314280, + "step": 4234, + "time_per_iteration": 3.333515167236328 + }, + { + "auxiliary_loss_clip": 0.01158594, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.05407727, + "balance_loss_mlp": 1.02714014, + "epoch": 0.5092286418565503, + "flos": 21104881050240.0, + "grad_norm": 2.296637304685272, + "language_loss": 0.71788311, + "learning_rate": 2.0373884610107765e-06, + "loss": 0.73982489, + "num_input_tokens_seen": 91334595, + "step": 4235, + "time_per_iteration": 2.520890235900879 + }, + { + "auxiliary_loss_clip": 0.01168491, + "auxiliary_loss_mlp": 0.01027052, + "balance_loss_clip": 1.04858708, + "balance_loss_mlp": 1.01885009, + "epoch": 0.5093488847471893, + "flos": 18621298972800.0, + "grad_norm": 3.6892371380204545, + "language_loss": 0.69298148, + "learning_rate": 2.0366096226744225e-06, + "loss": 0.71493697, + "num_input_tokens_seen": 91349790, + "step": 4236, + "time_per_iteration": 2.4222216606140137 + }, + { + "auxiliary_loss_clip": 0.01157311, + "auxiliary_loss_mlp": 0.01035518, + "balance_loss_clip": 1.05011976, + "balance_loss_mlp": 1.02727509, + "epoch": 0.5094691276378284, + "flos": 23803783205760.0, + "grad_norm": 1.780245047229767, + "language_loss": 0.7679432, + "learning_rate": 2.035830778784418e-06, + "loss": 0.78987145, + "num_input_tokens_seen": 91370465, + "step": 4237, + "time_per_iteration": 3.2091763019561768 + }, + { + "auxiliary_loss_clip": 0.01156599, + "auxiliary_loss_mlp": 0.01025372, + "balance_loss_clip": 1.05279922, + "balance_loss_mlp": 1.01682496, + "epoch": 0.5095893705284675, + "flos": 17420410546560.0, + "grad_norm": 1.8942234409410152, + "language_loss": 0.79986608, + "learning_rate": 2.0350519294589134e-06, + "loss": 0.82168573, + "num_input_tokens_seen": 91388505, + "step": 4238, + "time_per_iteration": 2.4703798294067383 + }, + { + "auxiliary_loss_clip": 0.01118546, + "auxiliary_loss_mlp": 0.01023291, + "balance_loss_clip": 1.04449272, + "balance_loss_mlp": 1.01473761, + "epoch": 0.5097096134191066, + "flos": 25849362839040.0, + "grad_norm": 1.6666450294808899, + "language_loss": 0.82938766, + "learning_rate": 2.0342730748160588e-06, + "loss": 0.850806, + "num_input_tokens_seen": 91408970, + "step": 4239, + "time_per_iteration": 2.5937604904174805 + }, + { + "auxiliary_loss_clip": 0.01151448, + "auxiliary_loss_mlp": 0.01028605, + "balance_loss_clip": 1.04785407, + "balance_loss_mlp": 1.02061224, + "epoch": 0.5098298563097456, + "flos": 27745122844800.0, + "grad_norm": 2.869188560325743, + "language_loss": 0.706734, + "learning_rate": 2.033494214974006e-06, + "loss": 0.72853452, + "num_input_tokens_seen": 91430115, + "step": 4240, + "time_per_iteration": 2.5369811058044434 + }, + { + "auxiliary_loss_clip": 0.01141895, + "auxiliary_loss_mlp": 0.01029812, + "balance_loss_clip": 1.04915464, + "balance_loss_mlp": 1.02198005, + "epoch": 0.5099500992003848, + "flos": 21358913011200.0, + "grad_norm": 1.703418559764768, + "language_loss": 0.83784044, + "learning_rate": 2.0327153500509067e-06, + "loss": 0.85955751, + "num_input_tokens_seen": 91449140, + "step": 4241, + "time_per_iteration": 2.4743709564208984 + }, + { + "auxiliary_loss_clip": 0.01155068, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.05198455, + "balance_loss_mlp": 1.02207017, + "epoch": 0.5100703420910239, + "flos": 19865999013120.0, + "grad_norm": 2.0116333807045126, + "language_loss": 0.84781206, + "learning_rate": 2.031936480164916e-06, + "loss": 0.86966276, + "num_input_tokens_seen": 91466880, + "step": 4242, + "time_per_iteration": 2.4793429374694824 + }, + { + "auxiliary_loss_clip": 0.01149133, + "auxiliary_loss_mlp": 0.01028188, + "balance_loss_clip": 1.05286491, + "balance_loss_mlp": 1.02001047, + "epoch": 0.5101905849816629, + "flos": 24648797635200.0, + "grad_norm": 4.389376168812962, + "language_loss": 0.79635608, + "learning_rate": 2.0311576054341857e-06, + "loss": 0.8181293, + "num_input_tokens_seen": 91487495, + "step": 4243, + "time_per_iteration": 2.5181028842926025 + }, + { + "auxiliary_loss_clip": 0.01183291, + "auxiliary_loss_mlp": 0.01026729, + "balance_loss_clip": 1.05620539, + "balance_loss_mlp": 1.01855731, + "epoch": 0.5103108278723021, + "flos": 22930076787840.0, + "grad_norm": 1.7259891666790357, + "language_loss": 0.62566996, + "learning_rate": 2.0303787259768715e-06, + "loss": 0.64777017, + "num_input_tokens_seen": 91508395, + "step": 4244, + "time_per_iteration": 2.4243245124816895 + }, + { + "auxiliary_loss_clip": 0.0115381, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.05220366, + "balance_loss_mlp": 1.0216701, + "epoch": 0.5104310707629411, + "flos": 21506613736320.0, + "grad_norm": 2.111999253410287, + "language_loss": 0.6946708, + "learning_rate": 2.0295998419111294e-06, + "loss": 0.71650481, + "num_input_tokens_seen": 91525685, + "step": 4245, + "time_per_iteration": 2.466531991958618 + }, + { + "auxiliary_loss_clip": 0.01111886, + "auxiliary_loss_mlp": 0.01031936, + "balance_loss_clip": 1.04429841, + "balance_loss_mlp": 1.02376997, + "epoch": 0.5105513136535802, + "flos": 14903180403840.0, + "grad_norm": 3.5010966158179557, + "language_loss": 0.733778, + "learning_rate": 2.028820953355115e-06, + "loss": 0.75521624, + "num_input_tokens_seen": 91543785, + "step": 4246, + "time_per_iteration": 2.544123888015747 + }, + { + "auxiliary_loss_clip": 0.01158315, + "auxiliary_loss_mlp": 0.01029634, + "balance_loss_clip": 1.04906607, + "balance_loss_mlp": 1.02098274, + "epoch": 0.5106715565442194, + "flos": 22602212421120.0, + "grad_norm": 1.6737099940698206, + "language_loss": 0.7864427, + "learning_rate": 2.0280420604269834e-06, + "loss": 0.80832219, + "num_input_tokens_seen": 91563325, + "step": 4247, + "time_per_iteration": 2.4977834224700928 + }, + { + "auxiliary_loss_clip": 0.01067746, + "auxiliary_loss_mlp": 0.0100102, + "balance_loss_clip": 1.01934361, + "balance_loss_mlp": 1.00002432, + "epoch": 0.5107917994348584, + "flos": 71027645558400.0, + "grad_norm": 0.7001215243755687, + "language_loss": 0.58915651, + "learning_rate": 2.027263163244895e-06, + "loss": 0.60984409, + "num_input_tokens_seen": 91632450, + "step": 4248, + "time_per_iteration": 3.191516160964966 + }, + { + "auxiliary_loss_clip": 0.01166075, + "auxiliary_loss_mlp": 0.01028669, + "balance_loss_clip": 1.05384707, + "balance_loss_mlp": 1.02083135, + "epoch": 0.5109120423254975, + "flos": 24827416992000.0, + "grad_norm": 1.6677924342726513, + "language_loss": 0.74236333, + "learning_rate": 2.026484261927005e-06, + "loss": 0.76431084, + "num_input_tokens_seen": 91651945, + "step": 4249, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.01174665, + "auxiliary_loss_mlp": 0.01027265, + "balance_loss_clip": 1.05639207, + "balance_loss_mlp": 1.01845551, + "epoch": 0.5110322852161366, + "flos": 21247661612160.0, + "grad_norm": 2.009971883001086, + "language_loss": 0.74127877, + "learning_rate": 2.025705356591475e-06, + "loss": 0.76329809, + "num_input_tokens_seen": 91669635, + "step": 4250, + "time_per_iteration": 2.4449050426483154 + }, + { + "auxiliary_loss_clip": 0.01045319, + "auxiliary_loss_mlp": 0.00753165, + "balance_loss_clip": 1.01980639, + "balance_loss_mlp": 0.99992067, + "epoch": 0.5111525281067757, + "flos": 66457114358400.0, + "grad_norm": 0.755468175708005, + "language_loss": 0.57979679, + "learning_rate": 2.024926447356462e-06, + "loss": 0.59778166, + "num_input_tokens_seen": 91731920, + "step": 4251, + "time_per_iteration": 3.0303187370300293 + }, + { + "auxiliary_loss_clip": 0.01165466, + "auxiliary_loss_mlp": 0.01033915, + "balance_loss_clip": 1.05213022, + "balance_loss_mlp": 1.02505744, + "epoch": 0.5112727709974147, + "flos": 14866731077760.0, + "grad_norm": 2.3387553783665362, + "language_loss": 0.78711414, + "learning_rate": 2.024147534340127e-06, + "loss": 0.8091079, + "num_input_tokens_seen": 91749780, + "step": 4252, + "time_per_iteration": 2.4719812870025635 + }, + { + "auxiliary_loss_clip": 0.01148648, + "auxiliary_loss_mlp": 0.01026464, + "balance_loss_clip": 1.04688764, + "balance_loss_mlp": 1.01856589, + "epoch": 0.5113930138880539, + "flos": 21177600134400.0, + "grad_norm": 1.7655880521304388, + "language_loss": 0.79851246, + "learning_rate": 2.02336861766063e-06, + "loss": 0.82026362, + "num_input_tokens_seen": 91768840, + "step": 4253, + "time_per_iteration": 2.5480082035064697 + }, + { + "auxiliary_loss_clip": 0.01174237, + "auxiliary_loss_mlp": 0.01027646, + "balance_loss_clip": 1.05411017, + "balance_loss_mlp": 1.01945055, + "epoch": 0.511513256778693, + "flos": 20409111630720.0, + "grad_norm": 1.7485085200015165, + "language_loss": 0.78985453, + "learning_rate": 2.0225896974361327e-06, + "loss": 0.81187338, + "num_input_tokens_seen": 91788945, + "step": 4254, + "time_per_iteration": 2.5228259563446045 + }, + { + "auxiliary_loss_clip": 0.01052463, + "auxiliary_loss_mlp": 0.01003924, + "balance_loss_clip": 1.02267146, + "balance_loss_mlp": 1.0027678, + "epoch": 0.511633499669332, + "flos": 69879975131520.0, + "grad_norm": 0.8621654082889129, + "language_loss": 0.59980214, + "learning_rate": 2.0218107737847962e-06, + "loss": 0.62036598, + "num_input_tokens_seen": 91850990, + "step": 4255, + "time_per_iteration": 3.131788730621338 + }, + { + "auxiliary_loss_clip": 0.01181696, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.05496347, + "balance_loss_mlp": 1.02102172, + "epoch": 0.5117537425599712, + "flos": 24097855852800.0, + "grad_norm": 1.8905596839496766, + "language_loss": 0.7469821, + "learning_rate": 2.0210318468247826e-06, + "loss": 0.76908821, + "num_input_tokens_seen": 91869960, + "step": 4256, + "time_per_iteration": 3.301748275756836 + }, + { + "auxiliary_loss_clip": 0.01152331, + "auxiliary_loss_mlp": 0.01025683, + "balance_loss_clip": 1.04946208, + "balance_loss_mlp": 1.01835787, + "epoch": 0.5118739854506102, + "flos": 20959550622720.0, + "grad_norm": 1.766194154064807, + "language_loss": 0.81982529, + "learning_rate": 2.020252916674255e-06, + "loss": 0.84160548, + "num_input_tokens_seen": 91889075, + "step": 4257, + "time_per_iteration": 2.518026113510132 + }, + { + "auxiliary_loss_clip": 0.01167079, + "auxiliary_loss_mlp": 0.01026932, + "balance_loss_clip": 1.05014861, + "balance_loss_mlp": 1.01886749, + "epoch": 0.5119942283412493, + "flos": 17457326749440.0, + "grad_norm": 1.8069203388510389, + "language_loss": 0.81198859, + "learning_rate": 2.019473983451375e-06, + "loss": 0.8339287, + "num_input_tokens_seen": 91907495, + "step": 4258, + "time_per_iteration": 2.4647676944732666 + }, + { + "auxiliary_loss_clip": 0.01144511, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.04867578, + "balance_loss_mlp": 1.01919746, + "epoch": 0.5121144712318885, + "flos": 21066743784960.0, + "grad_norm": 2.0220925179159495, + "language_loss": 0.71626347, + "learning_rate": 2.0186950472743076e-06, + "loss": 0.7379818, + "num_input_tokens_seen": 91927400, + "step": 4259, + "time_per_iteration": 2.5598652362823486 + }, + { + "auxiliary_loss_clip": 0.01181697, + "auxiliary_loss_mlp": 0.01027339, + "balance_loss_clip": 1.0537957, + "balance_loss_mlp": 1.01917958, + "epoch": 0.5122347141225275, + "flos": 19860791541120.0, + "grad_norm": 1.6777330566778843, + "language_loss": 0.74287784, + "learning_rate": 2.0179161082612162e-06, + "loss": 0.76496816, + "num_input_tokens_seen": 91946790, + "step": 4260, + "time_per_iteration": 3.3007566928863525 + }, + { + "auxiliary_loss_clip": 0.01147798, + "auxiliary_loss_mlp": 0.01027528, + "balance_loss_clip": 1.04789376, + "balance_loss_mlp": 1.01999366, + "epoch": 0.5123549570131666, + "flos": 22528487756160.0, + "grad_norm": 1.9901296992834492, + "language_loss": 0.7300002, + "learning_rate": 2.017137166530266e-06, + "loss": 0.75175345, + "num_input_tokens_seen": 91966325, + "step": 4261, + "time_per_iteration": 3.289705514907837 + }, + { + "auxiliary_loss_clip": 0.01154071, + "auxiliary_loss_mlp": 0.01029635, + "balance_loss_clip": 1.04925203, + "balance_loss_mlp": 1.02190161, + "epoch": 0.5124751999038056, + "flos": 20333375804160.0, + "grad_norm": 2.0714490609464873, + "language_loss": 0.79990327, + "learning_rate": 2.0163582221996213e-06, + "loss": 0.82174033, + "num_input_tokens_seen": 91984700, + "step": 4262, + "time_per_iteration": 2.6133012771606445 + }, + { + "auxiliary_loss_clip": 0.01153555, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.05056953, + "balance_loss_mlp": 1.0198065, + "epoch": 0.5125954427944448, + "flos": 39785970211200.0, + "grad_norm": 2.4268200798706934, + "language_loss": 0.68229413, + "learning_rate": 2.015579275387446e-06, + "loss": 0.70410722, + "num_input_tokens_seen": 92010020, + "step": 4263, + "time_per_iteration": 3.3962647914886475 + }, + { + "auxiliary_loss_clip": 0.01144284, + "auxiliary_loss_mlp": 0.01027971, + "balance_loss_clip": 1.05089891, + "balance_loss_mlp": 1.02004635, + "epoch": 0.5127156856850839, + "flos": 29205394358400.0, + "grad_norm": 1.9084650485949497, + "language_loss": 0.68326336, + "learning_rate": 2.0148003262119085e-06, + "loss": 0.70498592, + "num_input_tokens_seen": 92030990, + "step": 4264, + "time_per_iteration": 2.5450198650360107 + }, + { + "auxiliary_loss_clip": 0.01137384, + "auxiliary_loss_mlp": 0.01028729, + "balance_loss_clip": 1.04927349, + "balance_loss_mlp": 1.0205214, + "epoch": 0.5128359285757229, + "flos": 13553693412480.0, + "grad_norm": 1.8918325748929428, + "language_loss": 0.76884156, + "learning_rate": 2.0140213747911728e-06, + "loss": 0.79050267, + "num_input_tokens_seen": 92049525, + "step": 4265, + "time_per_iteration": 2.5075795650482178 + }, + { + "auxiliary_loss_clip": 0.01135409, + "auxiliary_loss_mlp": 0.01030036, + "balance_loss_clip": 1.04993486, + "balance_loss_mlp": 1.02166152, + "epoch": 0.5129561714663621, + "flos": 25192089820800.0, + "grad_norm": 1.9729055609448587, + "language_loss": 0.80425715, + "learning_rate": 2.013242421243406e-06, + "loss": 0.82591158, + "num_input_tokens_seen": 92068430, + "step": 4266, + "time_per_iteration": 2.5770020484924316 + }, + { + "auxiliary_loss_clip": 0.01125901, + "auxiliary_loss_mlp": 0.01021837, + "balance_loss_clip": 1.05133057, + "balance_loss_mlp": 1.01396894, + "epoch": 0.5130764143570011, + "flos": 18150223080960.0, + "grad_norm": 1.4989578344089376, + "language_loss": 0.7892133, + "learning_rate": 2.012463465686774e-06, + "loss": 0.81069064, + "num_input_tokens_seen": 92088180, + "step": 4267, + "time_per_iteration": 2.601551055908203 + }, + { + "auxiliary_loss_clip": 0.01050355, + "auxiliary_loss_mlp": 0.01004367, + "balance_loss_clip": 1.03345513, + "balance_loss_mlp": 1.00287092, + "epoch": 0.5131966572476402, + "flos": 59794896418560.0, + "grad_norm": 0.7681886659106236, + "language_loss": 0.54740709, + "learning_rate": 2.0116845082394446e-06, + "loss": 0.56795424, + "num_input_tokens_seen": 92153015, + "step": 4268, + "time_per_iteration": 3.1891331672668457 + }, + { + "auxiliary_loss_clip": 0.01172765, + "auxiliary_loss_mlp": 0.01024595, + "balance_loss_clip": 1.05345535, + "balance_loss_mlp": 1.01644742, + "epoch": 0.5133169001382794, + "flos": 18515219132160.0, + "grad_norm": 1.853034544470862, + "language_loss": 0.78480369, + "learning_rate": 2.0109055490195836e-06, + "loss": 0.80677724, + "num_input_tokens_seen": 92171470, + "step": 4269, + "time_per_iteration": 2.454500913619995 + }, + { + "auxiliary_loss_clip": 0.01113876, + "auxiliary_loss_mlp": 0.01025866, + "balance_loss_clip": 1.04113901, + "balance_loss_mlp": 1.01811767, + "epoch": 0.5134371430289184, + "flos": 15523537219200.0, + "grad_norm": 2.2869226140867918, + "language_loss": 0.64670295, + "learning_rate": 2.0101265881453605e-06, + "loss": 0.66810036, + "num_input_tokens_seen": 92189945, + "step": 4270, + "time_per_iteration": 2.58471941947937 + }, + { + "auxiliary_loss_clip": 0.01149587, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.05353785, + "balance_loss_mlp": 1.02380776, + "epoch": 0.5135573859195575, + "flos": 21433786911360.0, + "grad_norm": 2.088858675280562, + "language_loss": 0.78387833, + "learning_rate": 2.009347625734941e-06, + "loss": 0.80568963, + "num_input_tokens_seen": 92209855, + "step": 4271, + "time_per_iteration": 2.5243873596191406 + }, + { + "auxiliary_loss_clip": 0.01187983, + "auxiliary_loss_mlp": 0.01027625, + "balance_loss_clip": 1.0594821, + "balance_loss_mlp": 1.01976943, + "epoch": 0.5136776288101966, + "flos": 17712651600000.0, + "grad_norm": 2.1007574975095444, + "language_loss": 0.74510866, + "learning_rate": 2.0085686619064954e-06, + "loss": 0.76726472, + "num_input_tokens_seen": 92226295, + "step": 4272, + "time_per_iteration": 2.4277310371398926 + }, + { + "auxiliary_loss_clip": 0.01173347, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.056095, + "balance_loss_mlp": 1.02272475, + "epoch": 0.5137978717008357, + "flos": 16581680997120.0, + "grad_norm": 3.1822277116267355, + "language_loss": 0.82629323, + "learning_rate": 2.00778969677819e-06, + "loss": 0.84833366, + "num_input_tokens_seen": 92243330, + "step": 4273, + "time_per_iteration": 2.473097324371338 + }, + { + "auxiliary_loss_clip": 0.0115095, + "auxiliary_loss_mlp": 0.01024046, + "balance_loss_clip": 1.05058813, + "balance_loss_mlp": 1.01648211, + "epoch": 0.5139181145914747, + "flos": 20668243322880.0, + "grad_norm": 1.9084091067323563, + "language_loss": 0.63895261, + "learning_rate": 2.0070107304681934e-06, + "loss": 0.66070259, + "num_input_tokens_seen": 92262285, + "step": 4274, + "time_per_iteration": 2.5414175987243652 + }, + { + "auxiliary_loss_clip": 0.01139716, + "auxiliary_loss_mlp": 0.01027266, + "balance_loss_clip": 1.05352378, + "balance_loss_mlp": 1.01905286, + "epoch": 0.5140383574821139, + "flos": 32926996546560.0, + "grad_norm": 1.8882609542090956, + "language_loss": 0.78203666, + "learning_rate": 2.006231763094675e-06, + "loss": 0.80370641, + "num_input_tokens_seen": 92283305, + "step": 4275, + "time_per_iteration": 2.6354713439941406 + }, + { + "auxiliary_loss_clip": 0.01149281, + "auxiliary_loss_mlp": 0.0102304, + "balance_loss_clip": 1.05418932, + "balance_loss_mlp": 1.01544595, + "epoch": 0.514158600372753, + "flos": 19537093152000.0, + "grad_norm": 2.4021131350950373, + "language_loss": 0.87360674, + "learning_rate": 2.0054527947758027e-06, + "loss": 0.89532995, + "num_input_tokens_seen": 92302105, + "step": 4276, + "time_per_iteration": 2.600318670272827 + }, + { + "auxiliary_loss_clip": 0.01071156, + "auxiliary_loss_mlp": 0.01003584, + "balance_loss_clip": 1.02186275, + "balance_loss_mlp": 1.00257671, + "epoch": 0.514278843263392, + "flos": 62523855279360.0, + "grad_norm": 0.7206748185050432, + "language_loss": 0.55898213, + "learning_rate": 2.004673825629746e-06, + "loss": 0.57972956, + "num_input_tokens_seen": 92362885, + "step": 4277, + "time_per_iteration": 3.039154291152954 + }, + { + "auxiliary_loss_clip": 0.0114906, + "auxiliary_loss_mlp": 0.01028679, + "balance_loss_clip": 1.05074024, + "balance_loss_mlp": 1.02107334, + "epoch": 0.5143990861540312, + "flos": 25882328545920.0, + "grad_norm": 1.5859270425814573, + "language_loss": 0.72312629, + "learning_rate": 2.0038948557746744e-06, + "loss": 0.74490374, + "num_input_tokens_seen": 92384740, + "step": 4278, + "time_per_iteration": 2.527926206588745 + }, + { + "auxiliary_loss_clip": 0.01162618, + "auxiliary_loss_mlp": 0.01026902, + "balance_loss_clip": 1.05229259, + "balance_loss_mlp": 1.01935351, + "epoch": 0.5145193290446702, + "flos": 23330660238720.0, + "grad_norm": 1.7345291500477011, + "language_loss": 0.75129551, + "learning_rate": 2.0031158853287558e-06, + "loss": 0.77319074, + "num_input_tokens_seen": 92405175, + "step": 4279, + "time_per_iteration": 2.477201461791992 + }, + { + "auxiliary_loss_clip": 0.011537, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.05419528, + "balance_loss_mlp": 1.02298689, + "epoch": 0.5146395719353093, + "flos": 22856603518080.0, + "grad_norm": 2.1843884344457845, + "language_loss": 0.70376295, + "learning_rate": 2.0023369144101593e-06, + "loss": 0.72560543, + "num_input_tokens_seen": 92423345, + "step": 4280, + "time_per_iteration": 2.5090935230255127 + }, + { + "auxiliary_loss_clip": 0.01142817, + "auxiliary_loss_mlp": 0.01027925, + "balance_loss_clip": 1.04797149, + "balance_loss_mlp": 1.02022433, + "epoch": 0.5147598148259485, + "flos": 26391577616640.0, + "grad_norm": 1.6592812695804138, + "language_loss": 0.76773685, + "learning_rate": 2.0015579431370555e-06, + "loss": 0.78944433, + "num_input_tokens_seen": 92445025, + "step": 4281, + "time_per_iteration": 2.5510001182556152 + }, + { + "auxiliary_loss_clip": 0.01164272, + "auxiliary_loss_mlp": 0.01025841, + "balance_loss_clip": 1.05373812, + "balance_loss_mlp": 1.01799083, + "epoch": 0.5148800577165875, + "flos": 29965694561280.0, + "grad_norm": 2.785029311071165, + "language_loss": 0.69829583, + "learning_rate": 2.000778971627612e-06, + "loss": 0.72019702, + "num_input_tokens_seen": 92464490, + "step": 4282, + "time_per_iteration": 2.52500057220459 + }, + { + "auxiliary_loss_clip": 0.0114497, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.0489862, + "balance_loss_mlp": 1.02492595, + "epoch": 0.5150003006072266, + "flos": 17931383470080.0, + "grad_norm": 1.9897955407156322, + "language_loss": 0.90266895, + "learning_rate": 2e-06, + "loss": 0.92444777, + "num_input_tokens_seen": 92482085, + "step": 4283, + "time_per_iteration": 3.3161795139312744 + }, + { + "auxiliary_loss_clip": 0.01179945, + "auxiliary_loss_mlp": 0.01028807, + "balance_loss_clip": 1.05495954, + "balance_loss_mlp": 1.02117157, + "epoch": 0.5151205434978657, + "flos": 18478733892480.0, + "grad_norm": 1.699246641932537, + "language_loss": 0.85659552, + "learning_rate": 1.9992210283723878e-06, + "loss": 0.87868309, + "num_input_tokens_seen": 92499325, + "step": 4284, + "time_per_iteration": 2.410841226577759 + }, + { + "auxiliary_loss_clip": 0.01180323, + "auxiliary_loss_mlp": 0.01027353, + "balance_loss_clip": 1.0563767, + "balance_loss_mlp": 1.01995599, + "epoch": 0.5152407863885048, + "flos": 25341263003520.0, + "grad_norm": 1.6415695985877476, + "language_loss": 0.79565156, + "learning_rate": 1.9984420568629448e-06, + "loss": 0.81772828, + "num_input_tokens_seen": 92522090, + "step": 4285, + "time_per_iteration": 2.527308940887451 + }, + { + "auxiliary_loss_clip": 0.01167141, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.05316257, + "balance_loss_mlp": 1.0186851, + "epoch": 0.5153610292791438, + "flos": 18329740277760.0, + "grad_norm": 2.0560583649139854, + "language_loss": 0.78259861, + "learning_rate": 1.9976630855898405e-06, + "loss": 0.80452788, + "num_input_tokens_seen": 92539845, + "step": 4286, + "time_per_iteration": 2.4286062717437744 + }, + { + "auxiliary_loss_clip": 0.01146903, + "auxiliary_loss_mlp": 0.01024541, + "balance_loss_clip": 1.04628944, + "balance_loss_mlp": 1.01700091, + "epoch": 0.515481272169783, + "flos": 30409945971840.0, + "grad_norm": 2.008342408011769, + "language_loss": 0.74458647, + "learning_rate": 1.9968841146712445e-06, + "loss": 0.76630092, + "num_input_tokens_seen": 92559460, + "step": 4287, + "time_per_iteration": 3.4090259075164795 + }, + { + "auxiliary_loss_clip": 0.01109598, + "auxiliary_loss_mlp": 0.00762022, + "balance_loss_clip": 1.04694295, + "balance_loss_mlp": 1.00020337, + "epoch": 0.5156015150604221, + "flos": 23037305863680.0, + "grad_norm": 1.6638910783958147, + "language_loss": 0.71525955, + "learning_rate": 1.996105144225326e-06, + "loss": 0.73397571, + "num_input_tokens_seen": 92579695, + "step": 4288, + "time_per_iteration": 3.406270742416382 + }, + { + "auxiliary_loss_clip": 0.01166683, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.05421185, + "balance_loss_mlp": 1.02243233, + "epoch": 0.5157217579510611, + "flos": 17858556645120.0, + "grad_norm": 1.7709147666094829, + "language_loss": 0.78625512, + "learning_rate": 1.995326174370254e-06, + "loss": 0.80822086, + "num_input_tokens_seen": 92598795, + "step": 4289, + "time_per_iteration": 2.4571619033813477 + }, + { + "auxiliary_loss_clip": 0.01163046, + "auxiliary_loss_mlp": 0.00761795, + "balance_loss_clip": 1.05121469, + "balance_loss_mlp": 1.00013304, + "epoch": 0.5158420008417003, + "flos": 19171486569600.0, + "grad_norm": 1.557681297857005, + "language_loss": 0.73106652, + "learning_rate": 1.994547205224197e-06, + "loss": 0.75031495, + "num_input_tokens_seen": 92617700, + "step": 4290, + "time_per_iteration": 3.2195868492126465 + }, + { + "auxiliary_loss_clip": 0.01145769, + "auxiliary_loss_mlp": 0.0102658, + "balance_loss_clip": 1.05079579, + "balance_loss_mlp": 1.01839888, + "epoch": 0.5159622437323393, + "flos": 22419534827520.0, + "grad_norm": 1.9993505968268424, + "language_loss": 0.67429018, + "learning_rate": 1.993768236905325e-06, + "loss": 0.69601369, + "num_input_tokens_seen": 92638370, + "step": 4291, + "time_per_iteration": 2.5067594051361084 + }, + { + "auxiliary_loss_clip": 0.01147966, + "auxiliary_loss_mlp": 0.01025063, + "balance_loss_clip": 1.05018187, + "balance_loss_mlp": 1.01724887, + "epoch": 0.5160824866229784, + "flos": 24603010773120.0, + "grad_norm": 2.5267436636036757, + "language_loss": 0.65778154, + "learning_rate": 1.992989269531807e-06, + "loss": 0.67951179, + "num_input_tokens_seen": 92657180, + "step": 4292, + "time_per_iteration": 2.5515949726104736 + }, + { + "auxiliary_loss_clip": 0.01152177, + "auxiliary_loss_mlp": 0.01026493, + "balance_loss_clip": 1.04967022, + "balance_loss_mlp": 1.01865482, + "epoch": 0.5162027295136175, + "flos": 18002737837440.0, + "grad_norm": 2.4856415194819648, + "language_loss": 0.68858957, + "learning_rate": 1.99221030322181e-06, + "loss": 0.71037626, + "num_input_tokens_seen": 92673985, + "step": 4293, + "time_per_iteration": 2.4755516052246094 + }, + { + "auxiliary_loss_clip": 0.01155531, + "auxiliary_loss_mlp": 0.01027698, + "balance_loss_clip": 1.05152452, + "balance_loss_mlp": 1.02004457, + "epoch": 0.5163229724042566, + "flos": 27344611221120.0, + "grad_norm": 1.6513786060270488, + "language_loss": 0.80928516, + "learning_rate": 1.991431338093505e-06, + "loss": 0.83111745, + "num_input_tokens_seen": 92696340, + "step": 4294, + "time_per_iteration": 2.544196605682373 + }, + { + "auxiliary_loss_clip": 0.01151707, + "auxiliary_loss_mlp": 0.01026931, + "balance_loss_clip": 1.0545224, + "balance_loss_mlp": 1.0196445, + "epoch": 0.5164432152948957, + "flos": 21762764599680.0, + "grad_norm": 2.8668974819848865, + "language_loss": 0.79280865, + "learning_rate": 1.9906523742650587e-06, + "loss": 0.8145951, + "num_input_tokens_seen": 92715200, + "step": 4295, + "time_per_iteration": 2.4912047386169434 + }, + { + "auxiliary_loss_clip": 0.01180272, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.05210805, + "balance_loss_mlp": 1.02317548, + "epoch": 0.5165634581855347, + "flos": 25550334115200.0, + "grad_norm": 2.05186771821765, + "language_loss": 0.77610034, + "learning_rate": 1.9898734118546397e-06, + "loss": 0.79821783, + "num_input_tokens_seen": 92735150, + "step": 4296, + "time_per_iteration": 2.498521327972412 + }, + { + "auxiliary_loss_clip": 0.01100435, + "auxiliary_loss_mlp": 0.01025664, + "balance_loss_clip": 1.04560828, + "balance_loss_mlp": 1.01712275, + "epoch": 0.5166837010761739, + "flos": 19901191363200.0, + "grad_norm": 1.5975486388966356, + "language_loss": 0.80442047, + "learning_rate": 1.989094450980416e-06, + "loss": 0.82568151, + "num_input_tokens_seen": 92755250, + "step": 4297, + "time_per_iteration": 2.6359899044036865 + }, + { + "auxiliary_loss_clip": 0.01164353, + "auxiliary_loss_mlp": 0.01023905, + "balance_loss_clip": 1.05279386, + "balance_loss_mlp": 1.01618052, + "epoch": 0.516803943966813, + "flos": 26646076454400.0, + "grad_norm": 2.10310171739443, + "language_loss": 0.76911175, + "learning_rate": 1.9883154917605556e-06, + "loss": 0.79099429, + "num_input_tokens_seen": 92774460, + "step": 4298, + "time_per_iteration": 2.5876362323760986 + }, + { + "auxiliary_loss_clip": 0.01178971, + "auxiliary_loss_mlp": 0.01022804, + "balance_loss_clip": 1.05338526, + "balance_loss_mlp": 1.01564538, + "epoch": 0.516924186857452, + "flos": 19682854542720.0, + "grad_norm": 1.9547209466611781, + "language_loss": 0.83364004, + "learning_rate": 1.9875365343132262e-06, + "loss": 0.85565782, + "num_input_tokens_seen": 92791580, + "step": 4299, + "time_per_iteration": 2.420444965362549 + }, + { + "auxiliary_loss_clip": 0.01165741, + "auxiliary_loss_mlp": 0.00762273, + "balance_loss_clip": 1.05377007, + "balance_loss_mlp": 1.00012457, + "epoch": 0.5170444297480912, + "flos": 15956583586560.0, + "grad_norm": 3.028424899236101, + "language_loss": 0.84598947, + "learning_rate": 1.9867575787565946e-06, + "loss": 0.86526966, + "num_input_tokens_seen": 92806240, + "step": 4300, + "time_per_iteration": 2.4310429096221924 + }, + { + "auxiliary_loss_clip": 0.01167281, + "auxiliary_loss_mlp": 0.01025512, + "balance_loss_clip": 1.05279255, + "balance_loss_mlp": 1.01728082, + "epoch": 0.5171646726387302, + "flos": 14174157968640.0, + "grad_norm": 2.4605083133120766, + "language_loss": 0.86404359, + "learning_rate": 1.9859786252088275e-06, + "loss": 0.88597155, + "num_input_tokens_seen": 92823420, + "step": 4301, + "time_per_iteration": 2.463771104812622 + }, + { + "auxiliary_loss_clip": 0.01141744, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.05039752, + "balance_loss_mlp": 1.02247512, + "epoch": 0.5172849155293693, + "flos": 23578550974080.0, + "grad_norm": 2.897042630731926, + "language_loss": 0.66240561, + "learning_rate": 1.9851996737880914e-06, + "loss": 0.68413377, + "num_input_tokens_seen": 92838605, + "step": 4302, + "time_per_iteration": 2.521503448486328 + }, + { + "auxiliary_loss_clip": 0.01171635, + "auxiliary_loss_mlp": 0.01031522, + "balance_loss_clip": 1.05365467, + "balance_loss_mlp": 1.02348089, + "epoch": 0.5174051584200084, + "flos": 14283541860480.0, + "grad_norm": 2.123928898927342, + "language_loss": 0.74533945, + "learning_rate": 1.9844207246125537e-06, + "loss": 0.767371, + "num_input_tokens_seen": 92855185, + "step": 4303, + "time_per_iteration": 2.484696865081787 + }, + { + "auxiliary_loss_clip": 0.01148577, + "auxiliary_loss_mlp": 0.01024082, + "balance_loss_clip": 1.04949856, + "balance_loss_mlp": 1.01698649, + "epoch": 0.5175254013106475, + "flos": 37889384192640.0, + "grad_norm": 1.7874384254251288, + "language_loss": 0.68360448, + "learning_rate": 1.983641777800379e-06, + "loss": 0.70533103, + "num_input_tokens_seen": 92877830, + "step": 4304, + "time_per_iteration": 2.6318607330322266 + }, + { + "auxiliary_loss_clip": 0.01062865, + "auxiliary_loss_mlp": 0.01005569, + "balance_loss_clip": 1.01982236, + "balance_loss_mlp": 1.00444841, + "epoch": 0.5176456442012866, + "flos": 68549737829760.0, + "grad_norm": 0.8793157424073644, + "language_loss": 0.58793604, + "learning_rate": 1.9828628334697343e-06, + "loss": 0.60862041, + "num_input_tokens_seen": 92945040, + "step": 4305, + "time_per_iteration": 3.240938901901245 + }, + { + "auxiliary_loss_clip": 0.01065804, + "auxiliary_loss_mlp": 0.01004215, + "balance_loss_clip": 1.0218854, + "balance_loss_mlp": 1.00299871, + "epoch": 0.5177658870919257, + "flos": 64084137235200.0, + "grad_norm": 0.7698451296104786, + "language_loss": 0.54749763, + "learning_rate": 1.982083891738784e-06, + "loss": 0.56819779, + "num_input_tokens_seen": 93005910, + "step": 4306, + "time_per_iteration": 3.0985476970672607 + }, + { + "auxiliary_loss_clip": 0.01147166, + "auxiliary_loss_mlp": 0.01026133, + "balance_loss_clip": 1.05288994, + "balance_loss_mlp": 1.01856327, + "epoch": 0.5178861299825648, + "flos": 26651248012800.0, + "grad_norm": 1.5003269605184575, + "language_loss": 0.82572871, + "learning_rate": 1.9813049527256923e-06, + "loss": 0.8474617, + "num_input_tokens_seen": 93026305, + "step": 4307, + "time_per_iteration": 2.54730224609375 + }, + { + "auxiliary_loss_clip": 0.01135141, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.04671049, + "balance_loss_mlp": 1.02073932, + "epoch": 0.5180063728732038, + "flos": 17931886260480.0, + "grad_norm": 2.4991705447756334, + "language_loss": 0.81758183, + "learning_rate": 1.9805260165486252e-06, + "loss": 0.83921587, + "num_input_tokens_seen": 93045675, + "step": 4308, + "time_per_iteration": 2.5303757190704346 + }, + { + "auxiliary_loss_clip": 0.01165803, + "auxiliary_loss_mlp": 0.01022941, + "balance_loss_clip": 1.05437827, + "balance_loss_mlp": 1.01526403, + "epoch": 0.518126615763843, + "flos": 19500895221120.0, + "grad_norm": 2.2166133050093895, + "language_loss": 0.86462986, + "learning_rate": 1.9797470833257457e-06, + "loss": 0.88651729, + "num_input_tokens_seen": 93065375, + "step": 4309, + "time_per_iteration": 3.323747158050537 + }, + { + "auxiliary_loss_clip": 0.01167648, + "auxiliary_loss_mlp": 0.01027318, + "balance_loss_clip": 1.05438471, + "balance_loss_mlp": 1.01899672, + "epoch": 0.5182468586544821, + "flos": 20704082117760.0, + "grad_norm": 2.261365916362636, + "language_loss": 0.77514726, + "learning_rate": 1.9789681531752177e-06, + "loss": 0.79709697, + "num_input_tokens_seen": 93085595, + "step": 4310, + "time_per_iteration": 2.465726613998413 + }, + { + "auxiliary_loss_clip": 0.01119335, + "auxiliary_loss_mlp": 0.01025563, + "balance_loss_clip": 1.04865932, + "balance_loss_mlp": 1.01830888, + "epoch": 0.5183671015451211, + "flos": 23112107936640.0, + "grad_norm": 1.5614632562759347, + "language_loss": 0.72529924, + "learning_rate": 1.978189226215204e-06, + "loss": 0.74674821, + "num_input_tokens_seen": 93106140, + "step": 4311, + "time_per_iteration": 2.572096586227417 + }, + { + "auxiliary_loss_clip": 0.01179527, + "auxiliary_loss_mlp": 0.01027023, + "balance_loss_clip": 1.0537281, + "balance_loss_mlp": 1.01897609, + "epoch": 0.5184873444357603, + "flos": 17597090568960.0, + "grad_norm": 2.070529119022739, + "language_loss": 0.76982635, + "learning_rate": 1.9774103025638675e-06, + "loss": 0.79189187, + "num_input_tokens_seen": 93124265, + "step": 4312, + "time_per_iteration": 2.4041223526000977 + }, + { + "auxiliary_loss_clip": 0.01127932, + "auxiliary_loss_mlp": 0.01022869, + "balance_loss_clip": 1.05382943, + "balance_loss_mlp": 1.01555252, + "epoch": 0.5186075873263993, + "flos": 24936800883840.0, + "grad_norm": 1.5604120131675898, + "language_loss": 0.76241541, + "learning_rate": 1.9766313823393696e-06, + "loss": 0.78392339, + "num_input_tokens_seen": 93145130, + "step": 4313, + "time_per_iteration": 2.588914632797241 + }, + { + "auxiliary_loss_clip": 0.01117137, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.04318452, + "balance_loss_mlp": 1.02031946, + "epoch": 0.5187278302170384, + "flos": 15190106244480.0, + "grad_norm": 2.02884406468073, + "language_loss": 0.68983698, + "learning_rate": 1.975852465659873e-06, + "loss": 0.71129072, + "num_input_tokens_seen": 93161110, + "step": 4314, + "time_per_iteration": 3.3884284496307373 + }, + { + "auxiliary_loss_clip": 0.01169039, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.05429661, + "balance_loss_mlp": 1.02309155, + "epoch": 0.5188480731076776, + "flos": 25009412227200.0, + "grad_norm": 2.0468833794700854, + "language_loss": 0.69994009, + "learning_rate": 1.9750735526435377e-06, + "loss": 0.72193801, + "num_input_tokens_seen": 93178055, + "step": 4315, + "time_per_iteration": 3.3257477283477783 + }, + { + "auxiliary_loss_clip": 0.01153357, + "auxiliary_loss_mlp": 0.01025859, + "balance_loss_clip": 1.05405092, + "balance_loss_mlp": 1.01817012, + "epoch": 0.5189683159983166, + "flos": 24790141653120.0, + "grad_norm": 2.150479590116937, + "language_loss": 0.78674585, + "learning_rate": 1.974294643408525e-06, + "loss": 0.80853796, + "num_input_tokens_seen": 93195850, + "step": 4316, + "time_per_iteration": 3.349114418029785 + }, + { + "auxiliary_loss_clip": 0.01167859, + "auxiliary_loss_mlp": 0.01029102, + "balance_loss_clip": 1.05047369, + "balance_loss_mlp": 1.02165174, + "epoch": 0.5190885588889557, + "flos": 24754266944640.0, + "grad_norm": 4.668884775065209, + "language_loss": 0.67125702, + "learning_rate": 1.9735157380729947e-06, + "loss": 0.69322664, + "num_input_tokens_seen": 93216260, + "step": 4317, + "time_per_iteration": 2.485344409942627 + }, + { + "auxiliary_loss_clip": 0.01153076, + "auxiliary_loss_mlp": 0.01022755, + "balance_loss_clip": 1.04970074, + "balance_loss_mlp": 1.01567125, + "epoch": 0.5192088017795948, + "flos": 24712646060160.0, + "grad_norm": 1.8294509732764275, + "language_loss": 0.84534085, + "learning_rate": 1.9727368367551053e-06, + "loss": 0.86709917, + "num_input_tokens_seen": 93234810, + "step": 4318, + "time_per_iteration": 2.523404121398926 + }, + { + "auxiliary_loss_clip": 0.01138333, + "auxiliary_loss_mlp": 0.01025726, + "balance_loss_clip": 1.04777312, + "balance_loss_mlp": 1.01807272, + "epoch": 0.5193290446702339, + "flos": 27229588894080.0, + "grad_norm": 1.9492075566846674, + "language_loss": 0.68504083, + "learning_rate": 1.9719579395730164e-06, + "loss": 0.70668137, + "num_input_tokens_seen": 93254185, + "step": 4319, + "time_per_iteration": 2.528857946395874 + }, + { + "auxiliary_loss_clip": 0.01181693, + "auxiliary_loss_mlp": 0.01023542, + "balance_loss_clip": 1.05628467, + "balance_loss_mlp": 1.01593351, + "epoch": 0.5194492875608729, + "flos": 11473352392320.0, + "grad_norm": 2.370936817676572, + "language_loss": 0.93126476, + "learning_rate": 1.9711790466448854e-06, + "loss": 0.95331705, + "num_input_tokens_seen": 93268205, + "step": 4320, + "time_per_iteration": 2.37650990486145 + }, + { + "auxiliary_loss_clip": 0.01128511, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.04931116, + "balance_loss_mlp": 1.02584112, + "epoch": 0.5195695304515121, + "flos": 20338906498560.0, + "grad_norm": 2.0874242588894107, + "language_loss": 0.71438462, + "learning_rate": 1.9704001580888704e-06, + "loss": 0.73600817, + "num_input_tokens_seen": 93286945, + "step": 4321, + "time_per_iteration": 2.5591723918914795 + }, + { + "auxiliary_loss_clip": 0.0114406, + "auxiliary_loss_mlp": 0.00762592, + "balance_loss_clip": 1.04641712, + "balance_loss_mlp": 1.00028682, + "epoch": 0.5196897733421512, + "flos": 20048317470720.0, + "grad_norm": 3.0046490005712054, + "language_loss": 0.8701604, + "learning_rate": 1.9696212740231283e-06, + "loss": 0.88922691, + "num_input_tokens_seen": 93305595, + "step": 4322, + "time_per_iteration": 2.4885165691375732 + }, + { + "auxiliary_loss_clip": 0.01172126, + "auxiliary_loss_mlp": 0.01024591, + "balance_loss_clip": 1.0511024, + "balance_loss_mlp": 1.01629353, + "epoch": 0.5198100162327902, + "flos": 23805507058560.0, + "grad_norm": 2.139084722899328, + "language_loss": 0.82582247, + "learning_rate": 1.9688423945658146e-06, + "loss": 0.84778965, + "num_input_tokens_seen": 93326460, + "step": 4323, + "time_per_iteration": 2.487976312637329 + }, + { + "auxiliary_loss_clip": 0.01113229, + "auxiliary_loss_mlp": 0.0102657, + "balance_loss_clip": 1.0415988, + "balance_loss_mlp": 1.01821399, + "epoch": 0.5199302591234293, + "flos": 24023951619840.0, + "grad_norm": 1.985345452166628, + "language_loss": 0.7154538, + "learning_rate": 1.9680635198350845e-06, + "loss": 0.73685181, + "num_input_tokens_seen": 93346170, + "step": 4324, + "time_per_iteration": 2.576594829559326 + }, + { + "auxiliary_loss_clip": 0.01165487, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.04954374, + "balance_loss_mlp": 1.02458119, + "epoch": 0.5200505020140684, + "flos": 26359366095360.0, + "grad_norm": 2.012401828540083, + "language_loss": 0.72556275, + "learning_rate": 1.967284649949093e-06, + "loss": 0.74754739, + "num_input_tokens_seen": 93365380, + "step": 4325, + "time_per_iteration": 2.512523651123047 + }, + { + "auxiliary_loss_clip": 0.01135533, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.04761183, + "balance_loss_mlp": 1.02251267, + "epoch": 0.5201707449047075, + "flos": 39604262284800.0, + "grad_norm": 1.7634997804006274, + "language_loss": 0.72408974, + "learning_rate": 1.966505785025994e-06, + "loss": 0.74575114, + "num_input_tokens_seen": 93387285, + "step": 4326, + "time_per_iteration": 2.67545485496521 + }, + { + "auxiliary_loss_clip": 0.0113476, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.04808664, + "balance_loss_mlp": 1.01947165, + "epoch": 0.5202909877953465, + "flos": 53682788292480.0, + "grad_norm": 1.791571356702319, + "language_loss": 0.76148891, + "learning_rate": 1.965726925183941e-06, + "loss": 0.78311223, + "num_input_tokens_seen": 93410390, + "step": 4327, + "time_per_iteration": 2.8161420822143555 + }, + { + "auxiliary_loss_clip": 0.0118042, + "auxiliary_loss_mlp": 0.01022503, + "balance_loss_clip": 1.05524254, + "balance_loss_mlp": 1.01525545, + "epoch": 0.5204112306859857, + "flos": 19537021324800.0, + "grad_norm": 1.7586306918358665, + "language_loss": 0.84700787, + "learning_rate": 1.964948070541087e-06, + "loss": 0.86903709, + "num_input_tokens_seen": 93429050, + "step": 4328, + "time_per_iteration": 2.4287045001983643 + }, + { + "auxiliary_loss_clip": 0.01153016, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.04685843, + "balance_loss_mlp": 1.02280903, + "epoch": 0.5205314735766248, + "flos": 15304697608320.0, + "grad_norm": 2.4499090544097184, + "language_loss": 0.69810385, + "learning_rate": 1.9641692212155816e-06, + "loss": 0.71994138, + "num_input_tokens_seen": 93446815, + "step": 4329, + "time_per_iteration": 2.451185464859009 + }, + { + "auxiliary_loss_clip": 0.01123511, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.05195141, + "balance_loss_mlp": 1.02306128, + "epoch": 0.5206517164672638, + "flos": 59263701160320.0, + "grad_norm": 1.9996389057800341, + "language_loss": 0.72494018, + "learning_rate": 1.9633903773255777e-06, + "loss": 0.74648714, + "num_input_tokens_seen": 93469130, + "step": 4330, + "time_per_iteration": 2.8791377544403076 + }, + { + "auxiliary_loss_clip": 0.01176565, + "auxiliary_loss_mlp": 0.0102619, + "balance_loss_clip": 1.05012131, + "balance_loss_mlp": 1.01829851, + "epoch": 0.520771959357903, + "flos": 26871129118080.0, + "grad_norm": 1.7086111102781905, + "language_loss": 0.7495327, + "learning_rate": 1.9626115389892237e-06, + "loss": 0.77156025, + "num_input_tokens_seen": 93489920, + "step": 4331, + "time_per_iteration": 2.498196601867676 + }, + { + "auxiliary_loss_clip": 0.01143066, + "auxiliary_loss_mlp": 0.01024783, + "balance_loss_clip": 1.04956031, + "balance_loss_mlp": 1.01695704, + "epoch": 0.520892202248542, + "flos": 26907075653760.0, + "grad_norm": 2.7606361747193757, + "language_loss": 0.85726553, + "learning_rate": 1.96183270632467e-06, + "loss": 0.87894404, + "num_input_tokens_seen": 93509770, + "step": 4332, + "time_per_iteration": 2.6455507278442383 + }, + { + "auxiliary_loss_clip": 0.0112943, + "auxiliary_loss_mlp": 0.00762793, + "balance_loss_clip": 1.04587126, + "balance_loss_mlp": 1.0001626, + "epoch": 0.5210124451391811, + "flos": 25849434666240.0, + "grad_norm": 1.7626853461122032, + "language_loss": 0.79265553, + "learning_rate": 1.9610538794500644e-06, + "loss": 0.81157768, + "num_input_tokens_seen": 93529320, + "step": 4333, + "time_per_iteration": 2.5401036739349365 + }, + { + "auxiliary_loss_clip": 0.01051956, + "auxiliary_loss_mlp": 0.01003295, + "balance_loss_clip": 1.01970232, + "balance_loss_mlp": 1.00224614, + "epoch": 0.5211326880298203, + "flos": 70553804319360.0, + "grad_norm": 0.7727918858617991, + "language_loss": 0.59494221, + "learning_rate": 1.9602750584835542e-06, + "loss": 0.61549473, + "num_input_tokens_seen": 93595255, + "step": 4334, + "time_per_iteration": 3.2483842372894287 + }, + { + "auxiliary_loss_clip": 0.01147472, + "auxiliary_loss_mlp": 0.0102291, + "balance_loss_clip": 1.04837263, + "balance_loss_mlp": 1.01527476, + "epoch": 0.5212529309204593, + "flos": 15628898787840.0, + "grad_norm": 1.9299286669331854, + "language_loss": 0.82301176, + "learning_rate": 1.959496243543286e-06, + "loss": 0.84471554, + "num_input_tokens_seen": 93613135, + "step": 4335, + "time_per_iteration": 3.2852776050567627 + }, + { + "auxiliary_loss_clip": 0.01169145, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.05653191, + "balance_loss_mlp": 1.02782249, + "epoch": 0.5213731738110984, + "flos": 26242655829120.0, + "grad_norm": 2.013150200395761, + "language_loss": 0.79077005, + "learning_rate": 1.9587174347474057e-06, + "loss": 0.81282187, + "num_input_tokens_seen": 93629645, + "step": 4336, + "time_per_iteration": 2.4735939502716064 + }, + { + "auxiliary_loss_clip": 0.011091, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.04402423, + "balance_loss_mlp": 1.02120566, + "epoch": 0.5214934167017375, + "flos": 19418407637760.0, + "grad_norm": 2.116526222975386, + "language_loss": 0.82434964, + "learning_rate": 1.9579386322140574e-06, + "loss": 0.84573174, + "num_input_tokens_seen": 93645325, + "step": 4337, + "time_per_iteration": 2.5194547176361084 + }, + { + "auxiliary_loss_clip": 0.0118239, + "auxiliary_loss_mlp": 0.00762601, + "balance_loss_clip": 1.05463576, + "balance_loss_mlp": 1.00025547, + "epoch": 0.5216136595923766, + "flos": 30955788023040.0, + "grad_norm": 1.7550947998859048, + "language_loss": 0.80700946, + "learning_rate": 1.9571598360613854e-06, + "loss": 0.82645935, + "num_input_tokens_seen": 93668200, + "step": 4338, + "time_per_iteration": 2.488835334777832 + }, + { + "auxiliary_loss_clip": 0.01134789, + "auxiliary_loss_mlp": 0.01026559, + "balance_loss_clip": 1.04458201, + "balance_loss_mlp": 1.01879263, + "epoch": 0.5217339024830157, + "flos": 21945047143680.0, + "grad_norm": 2.0776521547677693, + "language_loss": 0.6973902, + "learning_rate": 1.956381046407532e-06, + "loss": 0.71900368, + "num_input_tokens_seen": 93688495, + "step": 4339, + "time_per_iteration": 2.475827932357788 + }, + { + "auxiliary_loss_clip": 0.01132451, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.04680729, + "balance_loss_mlp": 1.02343869, + "epoch": 0.5218541453736548, + "flos": 20923209037440.0, + "grad_norm": 1.7810824338663827, + "language_loss": 0.86097682, + "learning_rate": 1.9556022633706394e-06, + "loss": 0.88261497, + "num_input_tokens_seen": 93707285, + "step": 4340, + "time_per_iteration": 3.3773446083068848 + }, + { + "auxiliary_loss_clip": 0.01143484, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.04903436, + "balance_loss_mlp": 1.02143347, + "epoch": 0.5219743882642939, + "flos": 23951663498880.0, + "grad_norm": 1.6106960482397852, + "language_loss": 0.79703474, + "learning_rate": 1.954823487068848e-06, + "loss": 0.8187623, + "num_input_tokens_seen": 93727495, + "step": 4341, + "time_per_iteration": 3.3566136360168457 + }, + { + "auxiliary_loss_clip": 0.01165518, + "auxiliary_loss_mlp": 0.01028864, + "balance_loss_clip": 1.05290639, + "balance_loss_mlp": 1.02135944, + "epoch": 0.5220946311549329, + "flos": 28799280213120.0, + "grad_norm": 1.672869638190347, + "language_loss": 0.80807132, + "learning_rate": 1.9540447176202976e-06, + "loss": 0.83001512, + "num_input_tokens_seen": 93748740, + "step": 4342, + "time_per_iteration": 2.521245241165161 + }, + { + "auxiliary_loss_clip": 0.01067541, + "auxiliary_loss_mlp": 0.01001622, + "balance_loss_clip": 1.01848412, + "balance_loss_mlp": 1.00064456, + "epoch": 0.5222148740455721, + "flos": 67189369017600.0, + "grad_norm": 0.8879401036442628, + "language_loss": 0.60693657, + "learning_rate": 1.9532659551431272e-06, + "loss": 0.62762821, + "num_input_tokens_seen": 93815770, + "step": 4343, + "time_per_iteration": 3.2200825214385986 + }, + { + "auxiliary_loss_clip": 0.01166344, + "auxiliary_loss_mlp": 0.01026637, + "balance_loss_clip": 1.05193317, + "balance_loss_mlp": 1.01924598, + "epoch": 0.5223351169362112, + "flos": 61856164339200.0, + "grad_norm": 1.5510103453136361, + "language_loss": 0.67752343, + "learning_rate": 1.9524871997554744e-06, + "loss": 0.69945323, + "num_input_tokens_seen": 93843530, + "step": 4344, + "time_per_iteration": 3.5065371990203857 + }, + { + "auxiliary_loss_clip": 0.01165432, + "auxiliary_loss_mlp": 0.01026911, + "balance_loss_clip": 1.05167866, + "balance_loss_mlp": 1.0191927, + "epoch": 0.5224553598268502, + "flos": 14647388676480.0, + "grad_norm": 5.207205442187628, + "language_loss": 0.81162918, + "learning_rate": 1.951708451575475e-06, + "loss": 0.8335526, + "num_input_tokens_seen": 93860595, + "step": 4345, + "time_per_iteration": 2.4215919971466064 + }, + { + "auxiliary_loss_clip": 0.01143969, + "auxiliary_loss_mlp": 0.01030587, + "balance_loss_clip": 1.04691207, + "balance_loss_mlp": 1.02298737, + "epoch": 0.5225756027174894, + "flos": 14826043946880.0, + "grad_norm": 2.2067932277279616, + "language_loss": 0.82118744, + "learning_rate": 1.9509297107212657e-06, + "loss": 0.84293306, + "num_input_tokens_seen": 93877365, + "step": 4346, + "time_per_iteration": 2.481962203979492 + }, + { + "auxiliary_loss_clip": 0.01175944, + "auxiliary_loss_mlp": 0.01025648, + "balance_loss_clip": 1.05192351, + "balance_loss_mlp": 1.01818001, + "epoch": 0.5226958456081284, + "flos": 23512009029120.0, + "grad_norm": 1.7004833271668003, + "language_loss": 0.79269499, + "learning_rate": 1.95015097731098e-06, + "loss": 0.81471092, + "num_input_tokens_seen": 93896855, + "step": 4347, + "time_per_iteration": 2.435988664627075 + }, + { + "auxiliary_loss_clip": 0.01177566, + "auxiliary_loss_mlp": 0.01025499, + "balance_loss_clip": 1.05310202, + "balance_loss_mlp": 1.0179503, + "epoch": 0.5228160884987675, + "flos": 19062928690560.0, + "grad_norm": 4.298942695852673, + "language_loss": 0.81880325, + "learning_rate": 1.949372251462751e-06, + "loss": 0.8408339, + "num_input_tokens_seen": 93914270, + "step": 4348, + "time_per_iteration": 2.410475492477417 + }, + { + "auxiliary_loss_clip": 0.01136318, + "auxiliary_loss_mlp": 0.00761949, + "balance_loss_clip": 1.04916906, + "balance_loss_mlp": 1.00028086, + "epoch": 0.5229363313894067, + "flos": 21063224252160.0, + "grad_norm": 1.9589558864326888, + "language_loss": 0.82839519, + "learning_rate": 1.9485935332947124e-06, + "loss": 0.84737784, + "num_input_tokens_seen": 93932180, + "step": 4349, + "time_per_iteration": 2.55025577545166 + }, + { + "auxiliary_loss_clip": 0.01143372, + "auxiliary_loss_mlp": 0.01023202, + "balance_loss_clip": 1.04954123, + "balance_loss_mlp": 1.01607358, + "epoch": 0.5230565742800457, + "flos": 14830389492480.0, + "grad_norm": 2.524751867110256, + "language_loss": 0.84168446, + "learning_rate": 1.947814822924993e-06, + "loss": 0.86335015, + "num_input_tokens_seen": 93949690, + "step": 4350, + "time_per_iteration": 2.4523684978485107 + }, + { + "auxiliary_loss_clip": 0.01177895, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.05479002, + "balance_loss_mlp": 1.02355731, + "epoch": 0.5231768171706848, + "flos": 25813021253760.0, + "grad_norm": 1.8300090896213734, + "language_loss": 0.82848084, + "learning_rate": 1.9470361204717236e-06, + "loss": 0.85056877, + "num_input_tokens_seen": 93968830, + "step": 4351, + "time_per_iteration": 2.571258068084717 + }, + { + "auxiliary_loss_clip": 0.01139389, + "auxiliary_loss_mlp": 0.00762291, + "balance_loss_clip": 1.0495981, + "balance_loss_mlp": 1.00025225, + "epoch": 0.5232970600613239, + "flos": 22743807834240.0, + "grad_norm": 1.693903220217893, + "language_loss": 0.80695319, + "learning_rate": 1.9462574260530326e-06, + "loss": 0.82596999, + "num_input_tokens_seen": 93989110, + "step": 4352, + "time_per_iteration": 2.5304713249206543 + }, + { + "auxiliary_loss_clip": 0.01154177, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.04898679, + "balance_loss_mlp": 1.01900768, + "epoch": 0.523417302951963, + "flos": 17310703432320.0, + "grad_norm": 1.7964250712837848, + "language_loss": 0.80760342, + "learning_rate": 1.9454787397870472e-06, + "loss": 0.82941359, + "num_input_tokens_seen": 94006430, + "step": 4353, + "time_per_iteration": 2.4470584392547607 + }, + { + "auxiliary_loss_clip": 0.01103339, + "auxiliary_loss_mlp": 0.01028481, + "balance_loss_clip": 1.04832923, + "balance_loss_mlp": 1.02044952, + "epoch": 0.523537545842602, + "flos": 18551740285440.0, + "grad_norm": 1.770964479083237, + "language_loss": 0.71991873, + "learning_rate": 1.944700061791894e-06, + "loss": 0.74123693, + "num_input_tokens_seen": 94024825, + "step": 4354, + "time_per_iteration": 2.6146578788757324 + }, + { + "auxiliary_loss_clip": 0.01162298, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.05246806, + "balance_loss_mlp": 1.02224326, + "epoch": 0.5236577887332411, + "flos": 19719267955200.0, + "grad_norm": 2.612381994926738, + "language_loss": 0.65407026, + "learning_rate": 1.943921392185698e-06, + "loss": 0.67599326, + "num_input_tokens_seen": 94043450, + "step": 4355, + "time_per_iteration": 2.5120255947113037 + }, + { + "auxiliary_loss_clip": 0.01152573, + "auxiliary_loss_mlp": 0.01026797, + "balance_loss_clip": 1.0486176, + "balance_loss_mlp": 1.01908731, + "epoch": 0.5237780316238803, + "flos": 23550218121600.0, + "grad_norm": 2.1571417274222675, + "language_loss": 0.77540857, + "learning_rate": 1.9431427310865814e-06, + "loss": 0.79720229, + "num_input_tokens_seen": 94063055, + "step": 4356, + "time_per_iteration": 2.5682945251464844 + }, + { + "auxiliary_loss_clip": 0.01123109, + "auxiliary_loss_mlp": 0.0102904, + "balance_loss_clip": 1.04862618, + "balance_loss_mlp": 1.02081442, + "epoch": 0.5238982745145193, + "flos": 22491894775680.0, + "grad_norm": 1.6814132975443963, + "language_loss": 0.78262562, + "learning_rate": 1.942364078612667e-06, + "loss": 0.80414712, + "num_input_tokens_seen": 94081785, + "step": 4357, + "time_per_iteration": 2.5321431159973145 + }, + { + "auxiliary_loss_clip": 0.0114305, + "auxiliary_loss_mlp": 0.01024575, + "balance_loss_clip": 1.04884946, + "balance_loss_mlp": 1.01734471, + "epoch": 0.5240185174051584, + "flos": 27088927234560.0, + "grad_norm": 2.023984857391796, + "language_loss": 0.75546193, + "learning_rate": 1.9415854348820765e-06, + "loss": 0.77713811, + "num_input_tokens_seen": 94101635, + "step": 4358, + "time_per_iteration": 2.5493717193603516 + }, + { + "auxiliary_loss_clip": 0.01168851, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.05036139, + "balance_loss_mlp": 1.01940095, + "epoch": 0.5241387602957975, + "flos": 22674680110080.0, + "grad_norm": 2.0470814232962895, + "language_loss": 0.67465502, + "learning_rate": 1.940806800012929e-06, + "loss": 0.69662118, + "num_input_tokens_seen": 94121705, + "step": 4359, + "time_per_iteration": 2.460353374481201 + }, + { + "auxiliary_loss_clip": 0.01117597, + "auxiliary_loss_mlp": 0.00762406, + "balance_loss_clip": 1.04751372, + "balance_loss_mlp": 1.00024033, + "epoch": 0.5242590031864366, + "flos": 40553453134080.0, + "grad_norm": 1.6004649444778154, + "language_loss": 0.63456345, + "learning_rate": 1.9400281741233432e-06, + "loss": 0.65336347, + "num_input_tokens_seen": 94146595, + "step": 4360, + "time_per_iteration": 2.725044012069702 + }, + { + "auxiliary_loss_clip": 0.01039033, + "auxiliary_loss_mlp": 0.01008526, + "balance_loss_clip": 1.01704049, + "balance_loss_mlp": 1.00757241, + "epoch": 0.5243792460770756, + "flos": 66676313105280.0, + "grad_norm": 0.6555360742272462, + "language_loss": 0.52552974, + "learning_rate": 1.939249557331435e-06, + "loss": 0.54600537, + "num_input_tokens_seen": 94212410, + "step": 4361, + "time_per_iteration": 3.1416094303131104 + }, + { + "auxiliary_loss_clip": 0.01144121, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.05019045, + "balance_loss_mlp": 1.02224588, + "epoch": 0.5244994889677148, + "flos": 28183663992960.0, + "grad_norm": 1.913655570960772, + "language_loss": 0.72607535, + "learning_rate": 1.938470949755321e-06, + "loss": 0.74781001, + "num_input_tokens_seen": 94232290, + "step": 4362, + "time_per_iteration": 3.411875009536743 + }, + { + "auxiliary_loss_clip": 0.01047553, + "auxiliary_loss_mlp": 0.01006545, + "balance_loss_clip": 1.01656938, + "balance_loss_mlp": 1.00555539, + "epoch": 0.5246197318583539, + "flos": 65950379239680.0, + "grad_norm": 0.81850348095091, + "language_loss": 0.55699503, + "learning_rate": 1.937692351513115e-06, + "loss": 0.57753599, + "num_input_tokens_seen": 94291285, + "step": 4363, + "time_per_iteration": 3.0562753677368164 + }, + { + "auxiliary_loss_clip": 0.01168707, + "auxiliary_loss_mlp": 0.01026146, + "balance_loss_clip": 1.05165184, + "balance_loss_mlp": 1.0186832, + "epoch": 0.5247399747489929, + "flos": 21033490769280.0, + "grad_norm": 1.541235835049601, + "language_loss": 0.80282021, + "learning_rate": 1.9369137627229297e-06, + "loss": 0.82476878, + "num_input_tokens_seen": 94309685, + "step": 4364, + "time_per_iteration": 2.455899477005005 + }, + { + "auxiliary_loss_clip": 0.01163184, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.05228126, + "balance_loss_mlp": 1.0191952, + "epoch": 0.5248602176396321, + "flos": 19025940660480.0, + "grad_norm": 2.280632731269297, + "language_loss": 0.88028109, + "learning_rate": 1.936135183502877e-06, + "loss": 0.90218151, + "num_input_tokens_seen": 94326985, + "step": 4365, + "time_per_iteration": 2.469602108001709 + }, + { + "auxiliary_loss_clip": 0.01139628, + "auxiliary_loss_mlp": 0.01024871, + "balance_loss_clip": 1.04844642, + "balance_loss_mlp": 1.01735473, + "epoch": 0.5249804605302711, + "flos": 22200084685440.0, + "grad_norm": 2.1876929504225076, + "language_loss": 0.79830396, + "learning_rate": 1.935356613971066e-06, + "loss": 0.81994891, + "num_input_tokens_seen": 94347645, + "step": 4366, + "time_per_iteration": 3.388249397277832 + }, + { + "auxiliary_loss_clip": 0.01146819, + "auxiliary_loss_mlp": 0.00761961, + "balance_loss_clip": 1.04911792, + "balance_loss_mlp": 1.0002172, + "epoch": 0.5251007034209102, + "flos": 23805686626560.0, + "grad_norm": 1.7929640130798927, + "language_loss": 0.76504159, + "learning_rate": 1.9345780542456047e-06, + "loss": 0.78412938, + "num_input_tokens_seen": 94367020, + "step": 4367, + "time_per_iteration": 2.51507568359375 + }, + { + "auxiliary_loss_clip": 0.0115283, + "auxiliary_loss_mlp": 0.01028503, + "balance_loss_clip": 1.04873621, + "balance_loss_mlp": 1.02101707, + "epoch": 0.5252209463115494, + "flos": 23294605962240.0, + "grad_norm": 2.4416101440400015, + "language_loss": 0.71907568, + "learning_rate": 1.9337995044446007e-06, + "loss": 0.74088907, + "num_input_tokens_seen": 94385860, + "step": 4368, + "time_per_iteration": 3.278944492340088 + }, + { + "auxiliary_loss_clip": 0.01168199, + "auxiliary_loss_mlp": 0.01028193, + "balance_loss_clip": 1.05153382, + "balance_loss_mlp": 1.0203433, + "epoch": 0.5253411892021884, + "flos": 19828687760640.0, + "grad_norm": 2.044737898576299, + "language_loss": 0.79873681, + "learning_rate": 1.9330209646861596e-06, + "loss": 0.82070065, + "num_input_tokens_seen": 94405010, + "step": 4369, + "time_per_iteration": 2.4421024322509766 + }, + { + "auxiliary_loss_clip": 0.0114528, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.04863429, + "balance_loss_mlp": 1.02292848, + "epoch": 0.5254614320928275, + "flos": 24133730561280.0, + "grad_norm": 1.9552026212971447, + "language_loss": 0.77821803, + "learning_rate": 1.9322424350883843e-06, + "loss": 0.79997075, + "num_input_tokens_seen": 94426845, + "step": 4370, + "time_per_iteration": 2.5154197216033936 + }, + { + "auxiliary_loss_clip": 0.01150655, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.04898167, + "balance_loss_mlp": 1.02157307, + "epoch": 0.5255816749834666, + "flos": 24644954880000.0, + "grad_norm": 2.5500733863189264, + "language_loss": 0.78570747, + "learning_rate": 1.931463915769379e-06, + "loss": 0.80750191, + "num_input_tokens_seen": 94446960, + "step": 4371, + "time_per_iteration": 3.2991833686828613 + }, + { + "auxiliary_loss_clip": 0.01118164, + "auxiliary_loss_mlp": 0.01024478, + "balance_loss_clip": 1.04520321, + "balance_loss_mlp": 1.01684523, + "epoch": 0.5257019178741057, + "flos": 14136595320960.0, + "grad_norm": 2.242927013176805, + "language_loss": 0.74336219, + "learning_rate": 1.930685406847242e-06, + "loss": 0.76478863, + "num_input_tokens_seen": 94461535, + "step": 4372, + "time_per_iteration": 2.5128729343414307 + }, + { + "auxiliary_loss_clip": 0.01144863, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.04809499, + "balance_loss_mlp": 1.01987433, + "epoch": 0.5258221607647448, + "flos": 23548961145600.0, + "grad_norm": 1.4621954235803154, + "language_loss": 0.81530154, + "learning_rate": 1.9299069084400734e-06, + "loss": 0.83702195, + "num_input_tokens_seen": 94482395, + "step": 4373, + "time_per_iteration": 2.51153564453125 + }, + { + "auxiliary_loss_clip": 0.01133198, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.05137515, + "balance_loss_mlp": 1.01860726, + "epoch": 0.5259424036553839, + "flos": 24966103403520.0, + "grad_norm": 1.9663180526975068, + "language_loss": 0.69542682, + "learning_rate": 1.9291284206659717e-06, + "loss": 0.71702325, + "num_input_tokens_seen": 94500580, + "step": 4374, + "time_per_iteration": 2.5352210998535156 + }, + { + "auxiliary_loss_clip": 0.01179488, + "auxiliary_loss_mlp": 0.01021644, + "balance_loss_clip": 1.05401397, + "balance_loss_mlp": 1.01370513, + "epoch": 0.526062646546023, + "flos": 28763908295040.0, + "grad_norm": 2.2382752326694977, + "language_loss": 0.7157954, + "learning_rate": 1.928349943643032e-06, + "loss": 0.73780674, + "num_input_tokens_seen": 94519680, + "step": 4375, + "time_per_iteration": 2.4739067554473877 + }, + { + "auxiliary_loss_clip": 0.01157843, + "auxiliary_loss_mlp": 0.01028292, + "balance_loss_clip": 1.05037689, + "balance_loss_mlp": 1.02052212, + "epoch": 0.526182889436662, + "flos": 22821375254400.0, + "grad_norm": 1.8576834313800896, + "language_loss": 0.81786835, + "learning_rate": 1.9275714774893493e-06, + "loss": 0.83972967, + "num_input_tokens_seen": 94539135, + "step": 4376, + "time_per_iteration": 2.4604790210723877 + }, + { + "auxiliary_loss_clip": 0.01123334, + "auxiliary_loss_mlp": 0.01028427, + "balance_loss_clip": 1.04401994, + "balance_loss_mlp": 1.02013004, + "epoch": 0.5263031323273012, + "flos": 22929466256640.0, + "grad_norm": 2.182211073509432, + "language_loss": 0.73040879, + "learning_rate": 1.9267930223230154e-06, + "loss": 0.75192642, + "num_input_tokens_seen": 94557610, + "step": 4377, + "time_per_iteration": 2.5353589057922363 + }, + { + "auxiliary_loss_clip": 0.01152057, + "auxiliary_loss_mlp": 0.01027588, + "balance_loss_clip": 1.05085623, + "balance_loss_mlp": 1.0202837, + "epoch": 0.5264233752179402, + "flos": 17748634049280.0, + "grad_norm": 2.4095937786058936, + "language_loss": 0.779791, + "learning_rate": 1.9260145782621224e-06, + "loss": 0.80158746, + "num_input_tokens_seen": 94575390, + "step": 4378, + "time_per_iteration": 2.4543614387512207 + }, + { + "auxiliary_loss_clip": 0.01147088, + "auxiliary_loss_mlp": 0.01026594, + "balance_loss_clip": 1.05152142, + "balance_loss_mlp": 1.01914334, + "epoch": 0.5265436181085793, + "flos": 24421626069120.0, + "grad_norm": 1.947542888374729, + "language_loss": 0.87916994, + "learning_rate": 1.925236145424758e-06, + "loss": 0.9009068, + "num_input_tokens_seen": 94594210, + "step": 4379, + "time_per_iteration": 2.508543014526367 + }, + { + "auxiliary_loss_clip": 0.0106866, + "auxiliary_loss_mlp": 0.01002682, + "balance_loss_clip": 1.01650429, + "balance_loss_mlp": 1.00184727, + "epoch": 0.5266638609992185, + "flos": 69207298156800.0, + "grad_norm": 0.697470827288729, + "language_loss": 0.57586503, + "learning_rate": 1.924457723929012e-06, + "loss": 0.59657848, + "num_input_tokens_seen": 94665020, + "step": 4380, + "time_per_iteration": 3.174663782119751 + }, + { + "auxiliary_loss_clip": 0.01162604, + "auxiliary_loss_mlp": 0.01024759, + "balance_loss_clip": 1.05070293, + "balance_loss_mlp": 1.01727295, + "epoch": 0.5267841038898575, + "flos": 20738699850240.0, + "grad_norm": 2.304291087725399, + "language_loss": 0.83023971, + "learning_rate": 1.9236793138929685e-06, + "loss": 0.85211337, + "num_input_tokens_seen": 94684290, + "step": 4381, + "time_per_iteration": 2.45731258392334 + }, + { + "auxiliary_loss_clip": 0.01167551, + "auxiliary_loss_mlp": 0.01025076, + "balance_loss_clip": 1.05054665, + "balance_loss_mlp": 1.01741672, + "epoch": 0.5269043467804966, + "flos": 17234392988160.0, + "grad_norm": 1.9429832661019535, + "language_loss": 0.81147134, + "learning_rate": 1.9229009154347133e-06, + "loss": 0.83339763, + "num_input_tokens_seen": 94701880, + "step": 4382, + "time_per_iteration": 2.4214773178100586 + }, + { + "auxiliary_loss_clip": 0.01106013, + "auxiliary_loss_mlp": 0.00761869, + "balance_loss_clip": 1.04324234, + "balance_loss_mlp": 1.00023746, + "epoch": 0.5270245896711357, + "flos": 18223157646720.0, + "grad_norm": 2.029056836800655, + "language_loss": 0.80564171, + "learning_rate": 1.922122528672327e-06, + "loss": 0.82432055, + "num_input_tokens_seen": 94720545, + "step": 4383, + "time_per_iteration": 2.529179096221924 + }, + { + "auxiliary_loss_clip": 0.0117295, + "auxiliary_loss_mlp": 0.01023731, + "balance_loss_clip": 1.05089211, + "balance_loss_mlp": 1.01635194, + "epoch": 0.5271448325617748, + "flos": 21287558643840.0, + "grad_norm": 2.386289761773235, + "language_loss": 0.780711, + "learning_rate": 1.9213441537238914e-06, + "loss": 0.80267781, + "num_input_tokens_seen": 94737420, + "step": 4384, + "time_per_iteration": 2.4164559841156006 + }, + { + "auxiliary_loss_clip": 0.01030032, + "auxiliary_loss_mlp": 0.01001423, + "balance_loss_clip": 1.02040553, + "balance_loss_mlp": 1.00047517, + "epoch": 0.5272650754524139, + "flos": 65495497403520.0, + "grad_norm": 0.841077346623064, + "language_loss": 0.57383412, + "learning_rate": 1.920565790707485e-06, + "loss": 0.5941487, + "num_input_tokens_seen": 94802810, + "step": 4385, + "time_per_iteration": 3.2633578777313232 + }, + { + "auxiliary_loss_clip": 0.01129808, + "auxiliary_loss_mlp": 0.01027898, + "balance_loss_clip": 1.04693341, + "balance_loss_mlp": 1.0197506, + "epoch": 0.527385318343053, + "flos": 19676426008320.0, + "grad_norm": 1.8964439843109433, + "language_loss": 0.65567815, + "learning_rate": 1.9197874397411853e-06, + "loss": 0.67725527, + "num_input_tokens_seen": 94819440, + "step": 4386, + "time_per_iteration": 2.5258615016937256 + }, + { + "auxiliary_loss_clip": 0.01134758, + "auxiliary_loss_mlp": 0.01033394, + "balance_loss_clip": 1.04434276, + "balance_loss_mlp": 1.02490032, + "epoch": 0.5275055612336921, + "flos": 12712018947840.0, + "grad_norm": 3.2809255173526526, + "language_loss": 0.66693175, + "learning_rate": 1.919009100943067e-06, + "loss": 0.6886133, + "num_input_tokens_seen": 94835130, + "step": 4387, + "time_per_iteration": 2.471652030944824 + }, + { + "auxiliary_loss_clip": 0.01132024, + "auxiliary_loss_mlp": 0.010246, + "balance_loss_clip": 1.0480566, + "balance_loss_mlp": 1.01664257, + "epoch": 0.5276258041243311, + "flos": 17749029098880.0, + "grad_norm": 1.8054861978937125, + "language_loss": 0.65553266, + "learning_rate": 1.9182307744312043e-06, + "loss": 0.67709893, + "num_input_tokens_seen": 94852235, + "step": 4388, + "time_per_iteration": 3.363699436187744 + }, + { + "auxiliary_loss_clip": 0.01150688, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.04755569, + "balance_loss_mlp": 1.02365184, + "epoch": 0.5277460470149702, + "flos": 22710447077760.0, + "grad_norm": 2.4880963675929513, + "language_loss": 0.76714778, + "learning_rate": 1.9174524603236676e-06, + "loss": 0.78896612, + "num_input_tokens_seen": 94871185, + "step": 4389, + "time_per_iteration": 2.585955858230591 + }, + { + "auxiliary_loss_clip": 0.01151704, + "auxiliary_loss_mlp": 0.01025511, + "balance_loss_clip": 1.05048561, + "balance_loss_mlp": 1.01736891, + "epoch": 0.5278662899056094, + "flos": 19902699734400.0, + "grad_norm": 2.0773213747718327, + "language_loss": 0.75944352, + "learning_rate": 1.916674158738527e-06, + "loss": 0.78121567, + "num_input_tokens_seen": 94890090, + "step": 4390, + "time_per_iteration": 2.494558572769165 + }, + { + "auxiliary_loss_clip": 0.01130051, + "auxiliary_loss_mlp": 0.00763286, + "balance_loss_clip": 1.04871428, + "balance_loss_mlp": 1.00020909, + "epoch": 0.5279865327962484, + "flos": 18005215875840.0, + "grad_norm": 1.8242723600729802, + "language_loss": 0.60296273, + "learning_rate": 1.9158958697938506e-06, + "loss": 0.62189615, + "num_input_tokens_seen": 94908470, + "step": 4391, + "time_per_iteration": 2.5011210441589355 + }, + { + "auxiliary_loss_clip": 0.01142681, + "auxiliary_loss_mlp": 0.01028389, + "balance_loss_clip": 1.0460099, + "balance_loss_mlp": 1.0204463, + "epoch": 0.5281067756868875, + "flos": 15924443892480.0, + "grad_norm": 2.660231527652515, + "language_loss": 0.85814232, + "learning_rate": 1.9151175936077032e-06, + "loss": 0.87985301, + "num_input_tokens_seen": 94923440, + "step": 4392, + "time_per_iteration": 3.3119781017303467 + }, + { + "auxiliary_loss_clip": 0.01159666, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.05074072, + "balance_loss_mlp": 1.02007723, + "epoch": 0.5282270185775266, + "flos": 19426488197760.0, + "grad_norm": 1.6654065150361346, + "language_loss": 0.79473078, + "learning_rate": 1.9143393302981507e-06, + "loss": 0.81660873, + "num_input_tokens_seen": 94941125, + "step": 4393, + "time_per_iteration": 2.439785957336426 + }, + { + "auxiliary_loss_clip": 0.01150025, + "auxiliary_loss_mlp": 0.01024902, + "balance_loss_clip": 1.04806554, + "balance_loss_mlp": 1.01733208, + "epoch": 0.5283472614681657, + "flos": 16399613934720.0, + "grad_norm": 1.705385640586101, + "language_loss": 0.83006084, + "learning_rate": 1.913561079983252e-06, + "loss": 0.8518101, + "num_input_tokens_seen": 94959950, + "step": 4394, + "time_per_iteration": 3.2972543239593506 + }, + { + "auxiliary_loss_clip": 0.01153413, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.04833102, + "balance_loss_mlp": 1.02601802, + "epoch": 0.5284675043588047, + "flos": 26760524163840.0, + "grad_norm": 2.0889060258322156, + "language_loss": 0.74984896, + "learning_rate": 1.9127828427810693e-06, + "loss": 0.77173364, + "num_input_tokens_seen": 94980515, + "step": 4395, + "time_per_iteration": 2.5200607776641846 + }, + { + "auxiliary_loss_clip": 0.01142611, + "auxiliary_loss_mlp": 0.01028513, + "balance_loss_clip": 1.04810238, + "balance_loss_mlp": 1.02055538, + "epoch": 0.5285877472494439, + "flos": 19899898473600.0, + "grad_norm": 2.9597928023796944, + "language_loss": 0.81171829, + "learning_rate": 1.9120046188096607e-06, + "loss": 0.83342957, + "num_input_tokens_seen": 94998560, + "step": 4396, + "time_per_iteration": 2.5485148429870605 + }, + { + "auxiliary_loss_clip": 0.01151945, + "auxiliary_loss_mlp": 0.01036373, + "balance_loss_clip": 1.05396557, + "balance_loss_mlp": 1.02854121, + "epoch": 0.528707990140083, + "flos": 20011257613440.0, + "grad_norm": 1.9496307115972396, + "language_loss": 0.741799, + "learning_rate": 1.9112264081870804e-06, + "loss": 0.76368219, + "num_input_tokens_seen": 95016950, + "step": 4397, + "time_per_iteration": 2.489238739013672 + }, + { + "auxiliary_loss_clip": 0.01137155, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.05250216, + "balance_loss_mlp": 1.02244473, + "epoch": 0.528828233030722, + "flos": 20667956014080.0, + "grad_norm": 2.504448705693279, + "language_loss": 0.75749385, + "learning_rate": 1.9104482110313843e-06, + "loss": 0.77917469, + "num_input_tokens_seen": 95036540, + "step": 4398, + "time_per_iteration": 3.306952953338623 + }, + { + "auxiliary_loss_clip": 0.01161345, + "auxiliary_loss_mlp": 0.01025704, + "balance_loss_clip": 1.05063009, + "balance_loss_mlp": 1.01820564, + "epoch": 0.5289484759213612, + "flos": 25192448956800.0, + "grad_norm": 1.8535938708892494, + "language_loss": 0.73992157, + "learning_rate": 1.909670027460623e-06, + "loss": 0.76179206, + "num_input_tokens_seen": 95053840, + "step": 4399, + "time_per_iteration": 2.4948277473449707 + }, + { + "auxiliary_loss_clip": 0.01162441, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.05072093, + "balance_loss_mlp": 1.02026725, + "epoch": 0.5290687188120002, + "flos": 31139255715840.0, + "grad_norm": 1.7473423374003851, + "language_loss": 0.71797311, + "learning_rate": 1.908891857592847e-06, + "loss": 0.73987901, + "num_input_tokens_seen": 95074910, + "step": 4400, + "time_per_iteration": 2.608015298843384 + }, + { + "auxiliary_loss_clip": 0.01131708, + "auxiliary_loss_mlp": 0.01026469, + "balance_loss_clip": 1.0517652, + "balance_loss_mlp": 1.01796985, + "epoch": 0.5291889617026393, + "flos": 20119851406080.0, + "grad_norm": 2.2243282920012564, + "language_loss": 0.90379614, + "learning_rate": 1.9081137015461034e-06, + "loss": 0.92537796, + "num_input_tokens_seen": 95090985, + "step": 4401, + "time_per_iteration": 2.4960179328918457 + }, + { + "auxiliary_loss_clip": 0.01116025, + "auxiliary_loss_mlp": 0.0102671, + "balance_loss_clip": 1.04762197, + "balance_loss_mlp": 1.01910758, + "epoch": 0.5293092045932785, + "flos": 19643747610240.0, + "grad_norm": 2.049691496636775, + "language_loss": 0.90376449, + "learning_rate": 1.9073355594384383e-06, + "loss": 0.92519188, + "num_input_tokens_seen": 95109225, + "step": 4402, + "time_per_iteration": 2.527320146560669 + }, + { + "auxiliary_loss_clip": 0.01130218, + "auxiliary_loss_mlp": 0.01033117, + "balance_loss_clip": 1.05050611, + "balance_loss_mlp": 1.0251714, + "epoch": 0.5294294474839175, + "flos": 24317736958080.0, + "grad_norm": 1.9783832381434578, + "language_loss": 0.8036176, + "learning_rate": 1.906557431387895e-06, + "loss": 0.82525098, + "num_input_tokens_seen": 95128215, + "step": 4403, + "time_per_iteration": 2.5449538230895996 + }, + { + "auxiliary_loss_clip": 0.01135048, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.0544498, + "balance_loss_mlp": 1.02008843, + "epoch": 0.5295496903745566, + "flos": 18875941464960.0, + "grad_norm": 1.9139444703641642, + "language_loss": 0.78705049, + "learning_rate": 1.905779317512516e-06, + "loss": 0.80869007, + "num_input_tokens_seen": 95145760, + "step": 4404, + "time_per_iteration": 2.4876649379730225 + }, + { + "auxiliary_loss_clip": 0.01162042, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.05089378, + "balance_loss_mlp": 1.02075028, + "epoch": 0.5296699332651957, + "flos": 20923101296640.0, + "grad_norm": 3.3230698633396605, + "language_loss": 0.80456257, + "learning_rate": 1.9050012179303385e-06, + "loss": 0.82646787, + "num_input_tokens_seen": 95164270, + "step": 4405, + "time_per_iteration": 2.4593703746795654 + }, + { + "auxiliary_loss_clip": 0.01164464, + "auxiliary_loss_mlp": 0.01025555, + "balance_loss_clip": 1.04972339, + "balance_loss_mlp": 1.01764584, + "epoch": 0.5297901761558348, + "flos": 22046745525120.0, + "grad_norm": 2.572535409535337, + "language_loss": 0.68693566, + "learning_rate": 1.904223132759401e-06, + "loss": 0.70883584, + "num_input_tokens_seen": 95182870, + "step": 4406, + "time_per_iteration": 2.452558755874634 + }, + { + "auxiliary_loss_clip": 0.01166124, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.05229115, + "balance_loss_mlp": 1.01939964, + "epoch": 0.5299104190464738, + "flos": 21798495653760.0, + "grad_norm": 4.087952031227363, + "language_loss": 0.68952179, + "learning_rate": 1.9034450621177383e-06, + "loss": 0.71146154, + "num_input_tokens_seen": 95201190, + "step": 4407, + "time_per_iteration": 2.446545362472534 + }, + { + "auxiliary_loss_clip": 0.01162265, + "auxiliary_loss_mlp": 0.01038402, + "balance_loss_clip": 1.05196393, + "balance_loss_mlp": 1.02988458, + "epoch": 0.530030661937113, + "flos": 14720790119040.0, + "grad_norm": 2.0279941268381005, + "language_loss": 0.70698452, + "learning_rate": 1.9026670061233824e-06, + "loss": 0.72899115, + "num_input_tokens_seen": 95218625, + "step": 4408, + "time_per_iteration": 2.41420578956604 + }, + { + "auxiliary_loss_clip": 0.01144505, + "auxiliary_loss_mlp": 0.01025791, + "balance_loss_clip": 1.05098414, + "balance_loss_mlp": 1.01783323, + "epoch": 0.5301509048277521, + "flos": 21251504367360.0, + "grad_norm": 1.7368374521502858, + "language_loss": 0.80731225, + "learning_rate": 1.901888964894365e-06, + "loss": 0.82901514, + "num_input_tokens_seen": 95237665, + "step": 4409, + "time_per_iteration": 2.4782917499542236 + }, + { + "auxiliary_loss_clip": 0.01178522, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.0519352, + "balance_loss_mlp": 1.02119994, + "epoch": 0.5302711477183911, + "flos": 25957058791680.0, + "grad_norm": 1.7546288430541206, + "language_loss": 0.67922366, + "learning_rate": 1.9011109385487134e-06, + "loss": 0.70129764, + "num_input_tokens_seen": 95258915, + "step": 4410, + "time_per_iteration": 2.444899320602417 + }, + { + "auxiliary_loss_clip": 0.01178073, + "auxiliary_loss_mlp": 0.01027917, + "balance_loss_clip": 1.05082726, + "balance_loss_mlp": 1.01964402, + "epoch": 0.5303913906090303, + "flos": 22273126992000.0, + "grad_norm": 2.7118171127523176, + "language_loss": 0.66615164, + "learning_rate": 1.900332927204454e-06, + "loss": 0.68821156, + "num_input_tokens_seen": 95277365, + "step": 4411, + "time_per_iteration": 2.4106783866882324 + }, + { + "auxiliary_loss_clip": 0.0115675, + "auxiliary_loss_mlp": 0.01023546, + "balance_loss_clip": 1.05109882, + "balance_loss_mlp": 1.01582718, + "epoch": 0.5305116334996693, + "flos": 24936010784640.0, + "grad_norm": 1.895793965911426, + "language_loss": 0.76777911, + "learning_rate": 1.8995549309796097e-06, + "loss": 0.78958213, + "num_input_tokens_seen": 95296670, + "step": 4412, + "time_per_iteration": 2.529984712600708 + }, + { + "auxiliary_loss_clip": 0.01171853, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.05387521, + "balance_loss_mlp": 1.02017188, + "epoch": 0.5306318763903084, + "flos": 20189338266240.0, + "grad_norm": 2.0649350378005105, + "language_loss": 0.76648295, + "learning_rate": 1.8987769499922028e-06, + "loss": 0.78848183, + "num_input_tokens_seen": 95315640, + "step": 4413, + "time_per_iteration": 2.4816269874572754 + }, + { + "auxiliary_loss_clip": 0.01162868, + "auxiliary_loss_mlp": 0.00762269, + "balance_loss_clip": 1.05118632, + "balance_loss_mlp": 1.00029254, + "epoch": 0.5307521192809476, + "flos": 20266366982400.0, + "grad_norm": 2.431334215031439, + "language_loss": 0.7075935, + "learning_rate": 1.897998984360252e-06, + "loss": 0.72684491, + "num_input_tokens_seen": 95334610, + "step": 4414, + "time_per_iteration": 2.458519220352173 + }, + { + "auxiliary_loss_clip": 0.01147173, + "auxiliary_loss_mlp": 0.0102515, + "balance_loss_clip": 1.05042887, + "balance_loss_mlp": 1.01741982, + "epoch": 0.5308723621715866, + "flos": 28844276976000.0, + "grad_norm": 1.653642362517932, + "language_loss": 0.78462231, + "learning_rate": 1.897221034201775e-06, + "loss": 0.80634552, + "num_input_tokens_seen": 95358350, + "step": 4415, + "time_per_iteration": 3.3944060802459717 + }, + { + "auxiliary_loss_clip": 0.01135287, + "auxiliary_loss_mlp": 0.01027561, + "balance_loss_clip": 1.04646444, + "balance_loss_mlp": 1.02037251, + "epoch": 0.5309926050622257, + "flos": 27457766040960.0, + "grad_norm": 1.5624555616937665, + "language_loss": 0.6680097, + "learning_rate": 1.8964430996347842e-06, + "loss": 0.68963814, + "num_input_tokens_seen": 95379900, + "step": 4416, + "time_per_iteration": 2.588054656982422 + }, + { + "auxiliary_loss_clip": 0.01148879, + "auxiliary_loss_mlp": 0.01025753, + "balance_loss_clip": 1.04925323, + "balance_loss_mlp": 1.01759923, + "epoch": 0.5311128479528648, + "flos": 20514545026560.0, + "grad_norm": 3.7600022749501645, + "language_loss": 0.82547688, + "learning_rate": 1.8956651807772931e-06, + "loss": 0.84722322, + "num_input_tokens_seen": 95397935, + "step": 4417, + "time_per_iteration": 2.478142261505127 + }, + { + "auxiliary_loss_clip": 0.01160405, + "auxiliary_loss_mlp": 0.01022453, + "balance_loss_clip": 1.05012894, + "balance_loss_mlp": 1.01545537, + "epoch": 0.5312330908435039, + "flos": 21397660807680.0, + "grad_norm": 2.1443934624516383, + "language_loss": 0.83948964, + "learning_rate": 1.8948872777473115e-06, + "loss": 0.86131823, + "num_input_tokens_seen": 95415890, + "step": 4418, + "time_per_iteration": 2.4721696376800537 + }, + { + "auxiliary_loss_clip": 0.0114691, + "auxiliary_loss_mlp": 0.01027225, + "balance_loss_clip": 1.04902434, + "balance_loss_mlp": 1.02008104, + "epoch": 0.531353333734143, + "flos": 24717350741760.0, + "grad_norm": 1.8927535149723085, + "language_loss": 0.63758755, + "learning_rate": 1.8941093906628458e-06, + "loss": 0.65932882, + "num_input_tokens_seen": 95433675, + "step": 4419, + "time_per_iteration": 3.3488175868988037 + }, + { + "auxiliary_loss_clip": 0.01140728, + "auxiliary_loss_mlp": 0.01023722, + "balance_loss_clip": 1.04614508, + "balance_loss_mlp": 1.01649237, + "epoch": 0.531473576624782, + "flos": 30480689808000.0, + "grad_norm": 1.7407172442118777, + "language_loss": 0.70690119, + "learning_rate": 1.893331519641902e-06, + "loss": 0.72854573, + "num_input_tokens_seen": 95455820, + "step": 4420, + "time_per_iteration": 2.550868034362793 + }, + { + "auxiliary_loss_clip": 0.01122979, + "auxiliary_loss_mlp": 0.01027267, + "balance_loss_clip": 1.04337502, + "balance_loss_mlp": 1.01942933, + "epoch": 0.5315938195154212, + "flos": 23002975440000.0, + "grad_norm": 2.741893329887903, + "language_loss": 0.7380811, + "learning_rate": 1.8925536648024815e-06, + "loss": 0.75958359, + "num_input_tokens_seen": 95473240, + "step": 4421, + "time_per_iteration": 3.335223913192749 + }, + { + "auxiliary_loss_clip": 0.01177398, + "auxiliary_loss_mlp": 0.01025099, + "balance_loss_clip": 1.05210996, + "balance_loss_mlp": 1.01724613, + "epoch": 0.5317140624060602, + "flos": 22748584343040.0, + "grad_norm": 2.3155794764712474, + "language_loss": 0.75978935, + "learning_rate": 1.8917758262625849e-06, + "loss": 0.78181434, + "num_input_tokens_seen": 95493480, + "step": 4422, + "time_per_iteration": 2.426776885986328 + }, + { + "auxiliary_loss_clip": 0.01140854, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.04850554, + "balance_loss_mlp": 1.02175641, + "epoch": 0.5318343052966993, + "flos": 22821087945600.0, + "grad_norm": 1.6146765656286153, + "language_loss": 0.8065474, + "learning_rate": 1.8909980041402089e-06, + "loss": 0.82824844, + "num_input_tokens_seen": 95512075, + "step": 4423, + "time_per_iteration": 2.4910166263580322 + }, + { + "auxiliary_loss_clip": 0.01157005, + "auxiliary_loss_mlp": 0.0102723, + "balance_loss_clip": 1.04844546, + "balance_loss_mlp": 1.01927853, + "epoch": 0.5319545481873384, + "flos": 13626089274240.0, + "grad_norm": 2.1907451519042183, + "language_loss": 0.65487504, + "learning_rate": 1.8902201985533494e-06, + "loss": 0.67671734, + "num_input_tokens_seen": 95529340, + "step": 4424, + "time_per_iteration": 2.4017388820648193 + }, + { + "auxiliary_loss_clip": 0.01146654, + "auxiliary_loss_mlp": 0.01022627, + "balance_loss_clip": 1.04792094, + "balance_loss_mlp": 1.0155158, + "epoch": 0.5320747910779775, + "flos": 22162522037760.0, + "grad_norm": 1.8611239151710266, + "language_loss": 0.74934757, + "learning_rate": 1.8894424096199983e-06, + "loss": 0.77104038, + "num_input_tokens_seen": 95548545, + "step": 4425, + "time_per_iteration": 3.307497501373291 + }, + { + "auxiliary_loss_clip": 0.01165348, + "auxiliary_loss_mlp": 0.01026803, + "balance_loss_clip": 1.0534761, + "balance_loss_mlp": 1.01850557, + "epoch": 0.5321950339686166, + "flos": 18588081870720.0, + "grad_norm": 1.9229733687298485, + "language_loss": 0.85934412, + "learning_rate": 1.8886646374581463e-06, + "loss": 0.88126564, + "num_input_tokens_seen": 95567770, + "step": 4426, + "time_per_iteration": 2.428509473800659 + }, + { + "auxiliary_loss_clip": 0.01160504, + "auxiliary_loss_mlp": 0.01026745, + "balance_loss_clip": 1.04919577, + "balance_loss_mlp": 1.01886821, + "epoch": 0.5323152768592557, + "flos": 22856818999680.0, + "grad_norm": 1.8387797003995645, + "language_loss": 0.71122217, + "learning_rate": 1.8878868821857795e-06, + "loss": 0.73309469, + "num_input_tokens_seen": 95587420, + "step": 4427, + "time_per_iteration": 2.4547226428985596 + }, + { + "auxiliary_loss_clip": 0.01117063, + "auxiliary_loss_mlp": 0.01029392, + "balance_loss_clip": 1.04383039, + "balance_loss_mlp": 1.02111292, + "epoch": 0.5324355197498948, + "flos": 33948690998400.0, + "grad_norm": 2.541021035417326, + "language_loss": 0.751773, + "learning_rate": 1.8871091439208838e-06, + "loss": 0.77323759, + "num_input_tokens_seen": 95609030, + "step": 4428, + "time_per_iteration": 2.6807432174682617 + }, + { + "auxiliary_loss_clip": 0.01120159, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.04662943, + "balance_loss_mlp": 1.02164638, + "epoch": 0.5325557626405338, + "flos": 23256720092160.0, + "grad_norm": 2.148824894659783, + "language_loss": 0.77255893, + "learning_rate": 1.8863314227814414e-06, + "loss": 0.79406106, + "num_input_tokens_seen": 95627340, + "step": 4429, + "time_per_iteration": 2.5454797744750977 + }, + { + "auxiliary_loss_clip": 0.01169561, + "auxiliary_loss_mlp": 0.01027903, + "balance_loss_clip": 1.05370152, + "balance_loss_mlp": 1.01973116, + "epoch": 0.532676005531173, + "flos": 26718687797760.0, + "grad_norm": 2.203540722026896, + "language_loss": 0.48745137, + "learning_rate": 1.8855537188854313e-06, + "loss": 0.50942606, + "num_input_tokens_seen": 95646315, + "step": 4430, + "time_per_iteration": 2.4973628520965576 + }, + { + "auxiliary_loss_clip": 0.01163668, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.04846144, + "balance_loss_mlp": 1.02065754, + "epoch": 0.5327962484218121, + "flos": 17894610921600.0, + "grad_norm": 2.4094500990062464, + "language_loss": 0.78297079, + "learning_rate": 1.8847760323508315e-06, + "loss": 0.80489039, + "num_input_tokens_seen": 95665220, + "step": 4431, + "time_per_iteration": 2.4343512058258057 + }, + { + "auxiliary_loss_clip": 0.01143125, + "auxiliary_loss_mlp": 0.01026057, + "balance_loss_clip": 1.04928517, + "balance_loss_mlp": 1.01900911, + "epoch": 0.5329164913124511, + "flos": 17925385898880.0, + "grad_norm": 1.9279859207312136, + "language_loss": 0.7563687, + "learning_rate": 1.883998363295616e-06, + "loss": 0.77806056, + "num_input_tokens_seen": 95682700, + "step": 4432, + "time_per_iteration": 2.4565107822418213 + }, + { + "auxiliary_loss_clip": 0.01054434, + "auxiliary_loss_mlp": 0.0100109, + "balance_loss_clip": 1.01677632, + "balance_loss_mlp": 1.00011253, + "epoch": 0.5330367342030903, + "flos": 57254178781440.0, + "grad_norm": 0.87809699125139, + "language_loss": 0.62535334, + "learning_rate": 1.8832207118377565e-06, + "loss": 0.64590859, + "num_input_tokens_seen": 95738070, + "step": 4433, + "time_per_iteration": 2.9914746284484863 + }, + { + "auxiliary_loss_clip": 0.01173744, + "auxiliary_loss_mlp": 0.01024413, + "balance_loss_clip": 1.05066645, + "balance_loss_mlp": 1.01730776, + "epoch": 0.5331569770937293, + "flos": 17420518287360.0, + "grad_norm": 2.126230536835848, + "language_loss": 0.69381464, + "learning_rate": 1.882443078095222e-06, + "loss": 0.71579623, + "num_input_tokens_seen": 95756950, + "step": 4434, + "time_per_iteration": 2.394221067428589 + }, + { + "auxiliary_loss_clip": 0.01043321, + "auxiliary_loss_mlp": 0.01001025, + "balance_loss_clip": 1.02038908, + "balance_loss_mlp": 1.00010157, + "epoch": 0.5332772199843684, + "flos": 56750783627520.0, + "grad_norm": 1.1329709108492303, + "language_loss": 0.6684711, + "learning_rate": 1.8816654621859794e-06, + "loss": 0.68891454, + "num_input_tokens_seen": 95816615, + "step": 4435, + "time_per_iteration": 3.0088424682617188 + }, + { + "auxiliary_loss_clip": 0.01174566, + "auxiliary_loss_mlp": 0.0102574, + "balance_loss_clip": 1.05182791, + "balance_loss_mlp": 1.01775885, + "epoch": 0.5333974628750076, + "flos": 18697753071360.0, + "grad_norm": 2.1951282833747707, + "language_loss": 0.72302198, + "learning_rate": 1.8808878642279915e-06, + "loss": 0.74502504, + "num_input_tokens_seen": 95832020, + "step": 4436, + "time_per_iteration": 2.408475399017334 + }, + { + "auxiliary_loss_clip": 0.0113391, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.0433085, + "balance_loss_mlp": 1.02476478, + "epoch": 0.5335177057656466, + "flos": 23805507058560.0, + "grad_norm": 2.454058446815523, + "language_loss": 0.6479131, + "learning_rate": 1.8801102843392209e-06, + "loss": 0.6695807, + "num_input_tokens_seen": 95851425, + "step": 4437, + "time_per_iteration": 2.5382144451141357 + }, + { + "auxiliary_loss_clip": 0.01132168, + "auxiliary_loss_mlp": 0.01027486, + "balance_loss_clip": 1.0454185, + "balance_loss_mlp": 1.02004147, + "epoch": 0.5336379486562857, + "flos": 25078683605760.0, + "grad_norm": 1.611145090353668, + "language_loss": 0.8530699, + "learning_rate": 1.8793327226376238e-06, + "loss": 0.87466645, + "num_input_tokens_seen": 95870745, + "step": 4438, + "time_per_iteration": 2.5607376098632812 + }, + { + "auxiliary_loss_clip": 0.01155704, + "auxiliary_loss_mlp": 0.0102699, + "balance_loss_clip": 1.04951477, + "balance_loss_mlp": 1.0193429, + "epoch": 0.5337581915469248, + "flos": 21396691140480.0, + "grad_norm": 1.7876418179343254, + "language_loss": 0.80082142, + "learning_rate": 1.8785551792411569e-06, + "loss": 0.82264841, + "num_input_tokens_seen": 95889755, + "step": 4439, + "time_per_iteration": 2.4819371700286865 + }, + { + "auxiliary_loss_clip": 0.01147436, + "auxiliary_loss_mlp": 0.01028686, + "balance_loss_clip": 1.04863691, + "balance_loss_mlp": 1.0216918, + "epoch": 0.5338784344375639, + "flos": 14865905064960.0, + "grad_norm": 2.0018118718720026, + "language_loss": 0.82300633, + "learning_rate": 1.8777776542677733e-06, + "loss": 0.84476757, + "num_input_tokens_seen": 95907805, + "step": 4440, + "time_per_iteration": 2.4573745727539062 + }, + { + "auxiliary_loss_clip": 0.01132546, + "auxiliary_loss_mlp": 0.01023086, + "balance_loss_clip": 1.0441972, + "balance_loss_mlp": 1.01518857, + "epoch": 0.5339986773282029, + "flos": 20813501923200.0, + "grad_norm": 1.9381061106172475, + "language_loss": 0.73208892, + "learning_rate": 1.8770001478354216e-06, + "loss": 0.7536453, + "num_input_tokens_seen": 95927480, + "step": 4441, + "time_per_iteration": 2.5188307762145996 + }, + { + "auxiliary_loss_clip": 0.01157611, + "auxiliary_loss_mlp": 0.0103235, + "balance_loss_clip": 1.04856658, + "balance_loss_mlp": 1.02410674, + "epoch": 0.5341189202188421, + "flos": 17969089772160.0, + "grad_norm": 2.334046273550115, + "language_loss": 0.84375846, + "learning_rate": 1.8762226600620504e-06, + "loss": 0.86565804, + "num_input_tokens_seen": 95946095, + "step": 4442, + "time_per_iteration": 3.258622407913208 + }, + { + "auxiliary_loss_clip": 0.0115465, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.04857564, + "balance_loss_mlp": 1.01852918, + "epoch": 0.5342391631094812, + "flos": 11031866328960.0, + "grad_norm": 3.895016297064507, + "language_loss": 0.58954144, + "learning_rate": 1.8754451910656031e-06, + "loss": 0.61135733, + "num_input_tokens_seen": 95959995, + "step": 4443, + "time_per_iteration": 2.4188201427459717 + }, + { + "auxiliary_loss_clip": 0.01128019, + "auxiliary_loss_mlp": 0.0102669, + "balance_loss_clip": 1.04714465, + "balance_loss_mlp": 1.0186522, + "epoch": 0.5343594060001202, + "flos": 15339135772800.0, + "grad_norm": 2.0982498329638704, + "language_loss": 0.82721788, + "learning_rate": 1.8746677409640212e-06, + "loss": 0.84876502, + "num_input_tokens_seen": 95977095, + "step": 4444, + "time_per_iteration": 2.5219223499298096 + }, + { + "auxiliary_loss_clip": 0.01167921, + "auxiliary_loss_mlp": 0.01028401, + "balance_loss_clip": 1.05392075, + "balance_loss_mlp": 1.02075338, + "epoch": 0.5344796488907594, + "flos": 26900898514560.0, + "grad_norm": 1.7665490761804004, + "language_loss": 0.84337252, + "learning_rate": 1.8738903098752432e-06, + "loss": 0.86533576, + "num_input_tokens_seen": 95996225, + "step": 4445, + "time_per_iteration": 2.4985456466674805 + }, + { + "auxiliary_loss_clip": 0.01148807, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.04945087, + "balance_loss_mlp": 1.02342033, + "epoch": 0.5345998917813984, + "flos": 25411216740480.0, + "grad_norm": 2.271288353522816, + "language_loss": 0.73147124, + "learning_rate": 1.8731128979172052e-06, + "loss": 0.75326788, + "num_input_tokens_seen": 96015425, + "step": 4446, + "time_per_iteration": 3.371224880218506 + }, + { + "auxiliary_loss_clip": 0.01145833, + "auxiliary_loss_mlp": 0.01022238, + "balance_loss_clip": 1.04989493, + "balance_loss_mlp": 1.01518083, + "epoch": 0.5347201346720375, + "flos": 32853379622400.0, + "grad_norm": 2.1882611576014437, + "language_loss": 0.66941226, + "learning_rate": 1.8723355052078394e-06, + "loss": 0.69109297, + "num_input_tokens_seen": 96035460, + "step": 4447, + "time_per_iteration": 2.618471384048462 + }, + { + "auxiliary_loss_clip": 0.01159026, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.04825974, + "balance_loss_mlp": 1.02736092, + "epoch": 0.5348403775626767, + "flos": 17967940536960.0, + "grad_norm": 2.674220123911979, + "language_loss": 0.77289617, + "learning_rate": 1.8715581318650765e-06, + "loss": 0.79484028, + "num_input_tokens_seen": 96054515, + "step": 4448, + "time_per_iteration": 3.2570197582244873 + }, + { + "auxiliary_loss_clip": 0.011453, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.05004001, + "balance_loss_mlp": 1.02024376, + "epoch": 0.5349606204533157, + "flos": 17603339535360.0, + "grad_norm": 2.2044857888588596, + "language_loss": 0.81117606, + "learning_rate": 1.8707807780068422e-06, + "loss": 0.83292067, + "num_input_tokens_seen": 96072330, + "step": 4449, + "time_per_iteration": 2.5061838626861572 + }, + { + "auxiliary_loss_clip": 0.01144826, + "auxiliary_loss_mlp": 0.0102635, + "balance_loss_clip": 1.04815948, + "balance_loss_mlp": 1.01901841, + "epoch": 0.5350808633439548, + "flos": 29167831710720.0, + "grad_norm": 2.551763531400763, + "language_loss": 0.66170347, + "learning_rate": 1.8700034437510611e-06, + "loss": 0.68341517, + "num_input_tokens_seen": 96092425, + "step": 4450, + "time_per_iteration": 2.5877599716186523 + }, + { + "auxiliary_loss_clip": 0.01125631, + "auxiliary_loss_mlp": 0.01025652, + "balance_loss_clip": 1.0466001, + "balance_loss_mlp": 1.01777864, + "epoch": 0.5352011062345938, + "flos": 19499997381120.0, + "grad_norm": 2.2407233072706485, + "language_loss": 0.81731796, + "learning_rate": 1.8692261292156549e-06, + "loss": 0.83883083, + "num_input_tokens_seen": 96111660, + "step": 4451, + "time_per_iteration": 2.520409345626831 + }, + { + "auxiliary_loss_clip": 0.01176956, + "auxiliary_loss_mlp": 0.01024901, + "balance_loss_clip": 1.054739, + "balance_loss_mlp": 1.01735544, + "epoch": 0.535321349125233, + "flos": 23477642691840.0, + "grad_norm": 1.9449492563437474, + "language_loss": 0.81255102, + "learning_rate": 1.8684488345185401e-06, + "loss": 0.83456957, + "num_input_tokens_seen": 96131835, + "step": 4452, + "time_per_iteration": 3.1671388149261475 + }, + { + "auxiliary_loss_clip": 0.01178818, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.05379868, + "balance_loss_mlp": 1.02142, + "epoch": 0.535441592015872, + "flos": 20478059786880.0, + "grad_norm": 2.289697788667961, + "language_loss": 0.78501272, + "learning_rate": 1.8676715597776332e-06, + "loss": 0.80709332, + "num_input_tokens_seen": 96150180, + "step": 4453, + "time_per_iteration": 2.398439645767212 + }, + { + "auxiliary_loss_clip": 0.01110919, + "auxiliary_loss_mlp": 0.0102319, + "balance_loss_clip": 1.04223621, + "balance_loss_mlp": 1.01565957, + "epoch": 0.5355618349065111, + "flos": 19573147428480.0, + "grad_norm": 1.807117462478234, + "language_loss": 0.76476395, + "learning_rate": 1.8668943051108455e-06, + "loss": 0.78610504, + "num_input_tokens_seen": 96167485, + "step": 4454, + "time_per_iteration": 2.528853416442871 + }, + { + "auxiliary_loss_clip": 0.01146749, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.04812694, + "balance_loss_mlp": 1.0226028, + "epoch": 0.5356820777971503, + "flos": 24024633978240.0, + "grad_norm": 1.8669990840015993, + "language_loss": 0.76142764, + "learning_rate": 1.8661170706360856e-06, + "loss": 0.7832002, + "num_input_tokens_seen": 96186650, + "step": 4455, + "time_per_iteration": 2.507431745529175 + }, + { + "auxiliary_loss_clip": 0.0116157, + "auxiliary_loss_mlp": 0.01022897, + "balance_loss_clip": 1.0519228, + "balance_loss_mlp": 1.01577687, + "epoch": 0.5358023206877893, + "flos": 20884676722560.0, + "grad_norm": 1.6291379145055414, + "language_loss": 0.81508356, + "learning_rate": 1.8653398564712594e-06, + "loss": 0.83692825, + "num_input_tokens_seen": 96205595, + "step": 4456, + "time_per_iteration": 2.446989059448242 + }, + { + "auxiliary_loss_clip": 0.01160343, + "auxiliary_loss_mlp": 0.01024168, + "balance_loss_clip": 1.05177546, + "balance_loss_mlp": 1.01643705, + "epoch": 0.5359225635784284, + "flos": 22418996123520.0, + "grad_norm": 1.5592992000963437, + "language_loss": 0.82007861, + "learning_rate": 1.8645626627342704e-06, + "loss": 0.84192371, + "num_input_tokens_seen": 96226360, + "step": 4457, + "time_per_iteration": 2.4673218727111816 + }, + { + "auxiliary_loss_clip": 0.01165079, + "auxiliary_loss_mlp": 0.01028229, + "balance_loss_clip": 1.05072236, + "balance_loss_mlp": 1.02073061, + "epoch": 0.5360428064690675, + "flos": 24097784025600.0, + "grad_norm": 2.2760354984773654, + "language_loss": 0.80978495, + "learning_rate": 1.8637854895430172e-06, + "loss": 0.83171803, + "num_input_tokens_seen": 96245625, + "step": 4458, + "time_per_iteration": 2.476638078689575 + }, + { + "auxiliary_loss_clip": 0.01127204, + "auxiliary_loss_mlp": 0.0102944, + "balance_loss_clip": 1.04690289, + "balance_loss_mlp": 1.02123857, + "epoch": 0.5361630493597066, + "flos": 21434505183360.0, + "grad_norm": 2.137333711046165, + "language_loss": 0.69852883, + "learning_rate": 1.8630083370153978e-06, + "loss": 0.72009528, + "num_input_tokens_seen": 96265265, + "step": 4459, + "time_per_iteration": 2.5222668647766113 + }, + { + "auxiliary_loss_clip": 0.01028763, + "auxiliary_loss_mlp": 0.01001179, + "balance_loss_clip": 1.01792812, + "balance_loss_mlp": 1.00026083, + "epoch": 0.5362832922503457, + "flos": 68888696520960.0, + "grad_norm": 0.7515503345002652, + "language_loss": 0.55431956, + "learning_rate": 1.8622312052693041e-06, + "loss": 0.57461894, + "num_input_tokens_seen": 96326445, + "step": 4460, + "time_per_iteration": 3.224353075027466 + }, + { + "auxiliary_loss_clip": 0.01152905, + "auxiliary_loss_mlp": 0.01026892, + "balance_loss_clip": 1.04578114, + "balance_loss_mlp": 1.01929271, + "epoch": 0.5364035351409848, + "flos": 9793702563840.0, + "grad_norm": 2.3747290463549957, + "language_loss": 0.72064066, + "learning_rate": 1.8614540944226267e-06, + "loss": 0.74243867, + "num_input_tokens_seen": 96343115, + "step": 4461, + "time_per_iteration": 2.4199695587158203 + }, + { + "auxiliary_loss_clip": 0.01143205, + "auxiliary_loss_mlp": 0.01024452, + "balance_loss_clip": 1.04959953, + "balance_loss_mlp": 1.01750493, + "epoch": 0.5365237780316239, + "flos": 23290080848640.0, + "grad_norm": 1.8957466065967739, + "language_loss": 0.67709768, + "learning_rate": 1.8606770045932537e-06, + "loss": 0.69877422, + "num_input_tokens_seen": 96362230, + "step": 4462, + "time_per_iteration": 2.5026862621307373 + }, + { + "auxiliary_loss_clip": 0.01126294, + "auxiliary_loss_mlp": 0.01029348, + "balance_loss_clip": 1.04264879, + "balance_loss_mlp": 1.02093208, + "epoch": 0.5366440209222629, + "flos": 26578133879040.0, + "grad_norm": 2.385155724961322, + "language_loss": 0.81893027, + "learning_rate": 1.859899935899068e-06, + "loss": 0.84048671, + "num_input_tokens_seen": 96382085, + "step": 4463, + "time_per_iteration": 2.5726125240325928 + }, + { + "auxiliary_loss_clip": 0.01148911, + "auxiliary_loss_mlp": 0.01027171, + "balance_loss_clip": 1.05319619, + "balance_loss_mlp": 1.01915979, + "epoch": 0.5367642638129021, + "flos": 19608052469760.0, + "grad_norm": 1.5633399524107081, + "language_loss": 0.7887969, + "learning_rate": 1.8591228884579506e-06, + "loss": 0.81055772, + "num_input_tokens_seen": 96400580, + "step": 4464, + "time_per_iteration": 2.474151849746704 + }, + { + "auxiliary_loss_clip": 0.01137231, + "auxiliary_loss_mlp": 0.01026211, + "balance_loss_clip": 1.04786599, + "balance_loss_mlp": 1.01870084, + "epoch": 0.5368845067035412, + "flos": 23915214172800.0, + "grad_norm": 2.001505398609328, + "language_loss": 0.82025689, + "learning_rate": 1.8583458623877795e-06, + "loss": 0.84189129, + "num_input_tokens_seen": 96419680, + "step": 4465, + "time_per_iteration": 2.533148765563965 + }, + { + "auxiliary_loss_clip": 0.01163519, + "auxiliary_loss_mlp": 0.0102476, + "balance_loss_clip": 1.05115592, + "balance_loss_mlp": 1.01730335, + "epoch": 0.5370047495941802, + "flos": 16873131951360.0, + "grad_norm": 1.7100307378037336, + "language_loss": 0.74166769, + "learning_rate": 1.8575688578064281e-06, + "loss": 0.76355052, + "num_input_tokens_seen": 96437805, + "step": 4466, + "time_per_iteration": 2.419611692428589 + }, + { + "auxiliary_loss_clip": 0.0116479, + "auxiliary_loss_mlp": 0.01027861, + "balance_loss_clip": 1.05205464, + "balance_loss_mlp": 1.02023792, + "epoch": 0.5371249924848194, + "flos": 20740926493440.0, + "grad_norm": 1.6992476666578011, + "language_loss": 0.76564705, + "learning_rate": 1.8567918748317674e-06, + "loss": 0.78757358, + "num_input_tokens_seen": 96457155, + "step": 4467, + "time_per_iteration": 2.457407236099243 + }, + { + "auxiliary_loss_clip": 0.01131867, + "auxiliary_loss_mlp": 0.01027422, + "balance_loss_clip": 1.04425859, + "balance_loss_mlp": 1.0196259, + "epoch": 0.5372452353754584, + "flos": 17968120104960.0, + "grad_norm": 2.164692405242359, + "language_loss": 0.82604158, + "learning_rate": 1.8560149135816659e-06, + "loss": 0.84763443, + "num_input_tokens_seen": 96473990, + "step": 4468, + "time_per_iteration": 3.3103084564208984 + }, + { + "auxiliary_loss_clip": 0.01156866, + "auxiliary_loss_mlp": 0.01021639, + "balance_loss_clip": 1.04751825, + "balance_loss_mlp": 1.01446533, + "epoch": 0.5373654782660975, + "flos": 15377021642880.0, + "grad_norm": 2.1594839537498522, + "language_loss": 0.84159189, + "learning_rate": 1.8552379741739873e-06, + "loss": 0.86337698, + "num_input_tokens_seen": 96491335, + "step": 4469, + "time_per_iteration": 2.421666383743286 + }, + { + "auxiliary_loss_clip": 0.01045487, + "auxiliary_loss_mlp": 0.00752661, + "balance_loss_clip": 1.01706469, + "balance_loss_mlp": 0.99971449, + "epoch": 0.5374857211567367, + "flos": 69000091574400.0, + "grad_norm": 0.8942068885640814, + "language_loss": 0.55684155, + "learning_rate": 1.8544610567265935e-06, + "loss": 0.57482308, + "num_input_tokens_seen": 96545275, + "step": 4470, + "time_per_iteration": 3.03814959526062 + }, + { + "auxiliary_loss_clip": 0.0114831, + "auxiliary_loss_mlp": 0.00761958, + "balance_loss_clip": 1.05115497, + "balance_loss_mlp": 1.00026214, + "epoch": 0.5376059640473757, + "flos": 15085355207040.0, + "grad_norm": 1.9626685817636675, + "language_loss": 0.83195674, + "learning_rate": 1.853684161357341e-06, + "loss": 0.85105944, + "num_input_tokens_seen": 96562935, + "step": 4471, + "time_per_iteration": 2.464038133621216 + }, + { + "auxiliary_loss_clip": 0.01159627, + "auxiliary_loss_mlp": 0.0076238, + "balance_loss_clip": 1.05127382, + "balance_loss_mlp": 1.00029433, + "epoch": 0.5377262069380148, + "flos": 19792597570560.0, + "grad_norm": 1.7383531331988016, + "language_loss": 0.76809514, + "learning_rate": 1.852907288184085e-06, + "loss": 0.78731519, + "num_input_tokens_seen": 96581820, + "step": 4472, + "time_per_iteration": 2.4673213958740234 + }, + { + "auxiliary_loss_clip": 0.01123999, + "auxiliary_loss_mlp": 0.01027439, + "balance_loss_clip": 1.04639482, + "balance_loss_mlp": 1.01884377, + "epoch": 0.5378464498286539, + "flos": 30003077640960.0, + "grad_norm": 1.8400786012806205, + "language_loss": 0.70284921, + "learning_rate": 1.8521304373246762e-06, + "loss": 0.72436363, + "num_input_tokens_seen": 96602865, + "step": 4473, + "time_per_iteration": 3.4835758209228516 + }, + { + "auxiliary_loss_clip": 0.01165097, + "auxiliary_loss_mlp": 0.01025631, + "balance_loss_clip": 1.04973698, + "balance_loss_mlp": 1.01779938, + "epoch": 0.537966692719293, + "flos": 21251217058560.0, + "grad_norm": 2.506134031203014, + "language_loss": 0.88833678, + "learning_rate": 1.8513536088969626e-06, + "loss": 0.91024411, + "num_input_tokens_seen": 96620530, + "step": 4474, + "time_per_iteration": 3.2373931407928467 + }, + { + "auxiliary_loss_clip": 0.01164577, + "auxiliary_loss_mlp": 0.0103712, + "balance_loss_clip": 1.05295444, + "balance_loss_mlp": 1.02890062, + "epoch": 0.538086935609932, + "flos": 21543170803200.0, + "grad_norm": 2.1503063923741546, + "language_loss": 0.80407178, + "learning_rate": 1.8505768030187884e-06, + "loss": 0.82608879, + "num_input_tokens_seen": 96640660, + "step": 4475, + "time_per_iteration": 2.4650754928588867 + }, + { + "auxiliary_loss_clip": 0.01144412, + "auxiliary_loss_mlp": 0.01025949, + "balance_loss_clip": 1.05049706, + "balance_loss_mlp": 1.01882017, + "epoch": 0.5382071785005712, + "flos": 22747219626240.0, + "grad_norm": 1.7119117056191293, + "language_loss": 0.8012085, + "learning_rate": 1.849800019807995e-06, + "loss": 0.8229121, + "num_input_tokens_seen": 96661885, + "step": 4476, + "time_per_iteration": 2.526031970977783 + }, + { + "auxiliary_loss_clip": 0.0113173, + "auxiliary_loss_mlp": 0.0102614, + "balance_loss_clip": 1.04729128, + "balance_loss_mlp": 1.01858521, + "epoch": 0.5383274213912103, + "flos": 24934574240640.0, + "grad_norm": 2.210722727540462, + "language_loss": 0.70869958, + "learning_rate": 1.8490232593824186e-06, + "loss": 0.73027825, + "num_input_tokens_seen": 96678340, + "step": 4477, + "time_per_iteration": 2.5267958641052246 + }, + { + "auxiliary_loss_clip": 0.01147608, + "auxiliary_loss_mlp": 0.01026257, + "balance_loss_clip": 1.05252624, + "balance_loss_mlp": 1.01952422, + "epoch": 0.5384476642818493, + "flos": 22310186849280.0, + "grad_norm": 1.763550013622752, + "language_loss": 0.84696341, + "learning_rate": 1.8482465218598935e-06, + "loss": 0.86870199, + "num_input_tokens_seen": 96698285, + "step": 4478, + "time_per_iteration": 3.2263917922973633 + }, + { + "auxiliary_loss_clip": 0.01133589, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.04629946, + "balance_loss_mlp": 1.01925039, + "epoch": 0.5385679071724885, + "flos": 22711021695360.0, + "grad_norm": 1.8017459716605533, + "language_loss": 0.83167791, + "learning_rate": 1.8474698073582508e-06, + "loss": 0.85328817, + "num_input_tokens_seen": 96719655, + "step": 4479, + "time_per_iteration": 2.5472662448883057 + }, + { + "auxiliary_loss_clip": 0.01138837, + "auxiliary_loss_mlp": 0.01021095, + "balance_loss_clip": 1.04722977, + "balance_loss_mlp": 1.01362062, + "epoch": 0.5386881500631275, + "flos": 15953746412160.0, + "grad_norm": 1.9907662895863352, + "language_loss": 0.87314087, + "learning_rate": 1.8466931159953166e-06, + "loss": 0.89474022, + "num_input_tokens_seen": 96736290, + "step": 4480, + "time_per_iteration": 2.486788034439087 + }, + { + "auxiliary_loss_clip": 0.01153111, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.05212665, + "balance_loss_mlp": 1.02114248, + "epoch": 0.5388083929537666, + "flos": 24060041809920.0, + "grad_norm": 2.394991686298086, + "language_loss": 0.84316683, + "learning_rate": 1.8459164478889158e-06, + "loss": 0.86498547, + "num_input_tokens_seen": 96757685, + "step": 4481, + "time_per_iteration": 2.5128936767578125 + }, + { + "auxiliary_loss_clip": 0.01127108, + "auxiliary_loss_mlp": 0.01023799, + "balance_loss_clip": 1.04517388, + "balance_loss_mlp": 1.01647949, + "epoch": 0.5389286358444056, + "flos": 22236893147520.0, + "grad_norm": 1.875004365145341, + "language_loss": 0.76047671, + "learning_rate": 1.8451398031568663e-06, + "loss": 0.78198576, + "num_input_tokens_seen": 96777310, + "step": 4482, + "time_per_iteration": 2.516167640686035 + }, + { + "auxiliary_loss_clip": 0.01134465, + "auxiliary_loss_mlp": 0.01024904, + "balance_loss_clip": 1.04816866, + "balance_loss_mlp": 1.01709616, + "epoch": 0.5390488787350448, + "flos": 24281718595200.0, + "grad_norm": 1.7031201848368782, + "language_loss": 0.74754083, + "learning_rate": 1.844363181916986e-06, + "loss": 0.76913452, + "num_input_tokens_seen": 96798035, + "step": 4483, + "time_per_iteration": 2.5668463706970215 + }, + { + "auxiliary_loss_clip": 0.01159778, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.04962683, + "balance_loss_mlp": 1.02264643, + "epoch": 0.5391691216256839, + "flos": 16581393688320.0, + "grad_norm": 2.250537525079556, + "language_loss": 0.83371311, + "learning_rate": 1.8435865842870868e-06, + "loss": 0.85561067, + "num_input_tokens_seen": 96815975, + "step": 4484, + "time_per_iteration": 2.437711477279663 + }, + { + "auxiliary_loss_clip": 0.01138163, + "auxiliary_loss_mlp": 0.0076231, + "balance_loss_clip": 1.0449276, + "balance_loss_mlp": 1.00037479, + "epoch": 0.5392893645163229, + "flos": 23330049707520.0, + "grad_norm": 2.0537061908818086, + "language_loss": 0.71873277, + "learning_rate": 1.8428100103849787e-06, + "loss": 0.73773748, + "num_input_tokens_seen": 96835770, + "step": 4485, + "time_per_iteration": 2.5080618858337402 + }, + { + "auxiliary_loss_clip": 0.01150014, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.05330825, + "balance_loss_mlp": 1.02337241, + "epoch": 0.5394096074069621, + "flos": 15669801400320.0, + "grad_norm": 2.4248632676515145, + "language_loss": 0.73307145, + "learning_rate": 1.842033460328467e-06, + "loss": 0.75488675, + "num_input_tokens_seen": 96854490, + "step": 4486, + "time_per_iteration": 2.455778121948242 + }, + { + "auxiliary_loss_clip": 0.01150211, + "auxiliary_loss_mlp": 0.00761969, + "balance_loss_clip": 1.04753304, + "balance_loss_mlp": 1.0003593, + "epoch": 0.5395298502976011, + "flos": 22893447893760.0, + "grad_norm": 1.6631617719100145, + "language_loss": 0.75173771, + "learning_rate": 1.8412569342353541e-06, + "loss": 0.77085948, + "num_input_tokens_seen": 96874645, + "step": 4487, + "time_per_iteration": 2.5174357891082764 + }, + { + "auxiliary_loss_clip": 0.01153241, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.05108953, + "balance_loss_mlp": 1.02436388, + "epoch": 0.5396500931882402, + "flos": 23842135952640.0, + "grad_norm": 1.9005491259605878, + "language_loss": 0.84796524, + "learning_rate": 1.840480432223438e-06, + "loss": 0.86982417, + "num_input_tokens_seen": 96893650, + "step": 4488, + "time_per_iteration": 2.4916694164276123 + }, + { + "auxiliary_loss_clip": 0.01148741, + "auxiliary_loss_mlp": 0.01030293, + "balance_loss_clip": 1.04751098, + "balance_loss_mlp": 1.02270222, + "epoch": 0.5397703360788794, + "flos": 26322988596480.0, + "grad_norm": 2.205160208525085, + "language_loss": 0.77600622, + "learning_rate": 1.8397039544105131e-06, + "loss": 0.79779655, + "num_input_tokens_seen": 96912735, + "step": 4489, + "time_per_iteration": 2.534141778945923 + }, + { + "auxiliary_loss_clip": 0.01142902, + "auxiliary_loss_mlp": 0.01024235, + "balance_loss_clip": 1.04639983, + "balance_loss_mlp": 1.01648653, + "epoch": 0.5398905789695184, + "flos": 21214588164480.0, + "grad_norm": 2.154275094202878, + "language_loss": 0.69819635, + "learning_rate": 1.8389275009143711e-06, + "loss": 0.71986771, + "num_input_tokens_seen": 96932475, + "step": 4490, + "time_per_iteration": 2.4854838848114014 + }, + { + "auxiliary_loss_clip": 0.01172929, + "auxiliary_loss_mlp": 0.0102558, + "balance_loss_clip": 1.05064297, + "balance_loss_mlp": 1.01840913, + "epoch": 0.5400108218601575, + "flos": 25080335631360.0, + "grad_norm": 1.8663062078000634, + "language_loss": 0.73665971, + "learning_rate": 1.8381510718527988e-06, + "loss": 0.75864476, + "num_input_tokens_seen": 96952085, + "step": 4491, + "time_per_iteration": 2.4466052055358887 + }, + { + "auxiliary_loss_clip": 0.01151174, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.04742694, + "balance_loss_mlp": 1.02110505, + "epoch": 0.5401310647507966, + "flos": 26357498588160.0, + "grad_norm": 2.5472658261863463, + "language_loss": 0.63529658, + "learning_rate": 1.8373746673435812e-06, + "loss": 0.65709984, + "num_input_tokens_seen": 96973110, + "step": 4492, + "time_per_iteration": 2.532409191131592 + }, + { + "auxiliary_loss_clip": 0.01179352, + "auxiliary_loss_mlp": 0.0102842, + "balance_loss_clip": 1.05430782, + "balance_loss_mlp": 1.02051091, + "epoch": 0.5402513076414357, + "flos": 27855332749440.0, + "grad_norm": 1.661301080479555, + "language_loss": 0.79172289, + "learning_rate": 1.8365982875044964e-06, + "loss": 0.81380063, + "num_input_tokens_seen": 96993420, + "step": 4493, + "time_per_iteration": 2.4797379970550537 + }, + { + "auxiliary_loss_clip": 0.01169397, + "auxiliary_loss_mlp": 0.00762604, + "balance_loss_clip": 1.05331743, + "balance_loss_mlp": 1.00028503, + "epoch": 0.5403715505320748, + "flos": 22893771116160.0, + "grad_norm": 1.9877258192457343, + "language_loss": 0.7570402, + "learning_rate": 1.8358219324533217e-06, + "loss": 0.77636015, + "num_input_tokens_seen": 97013685, + "step": 4494, + "time_per_iteration": 2.471734046936035 + }, + { + "auxiliary_loss_clip": 0.01143205, + "auxiliary_loss_mlp": 0.01025245, + "balance_loss_clip": 1.04836488, + "balance_loss_mlp": 1.01842606, + "epoch": 0.5404917934227139, + "flos": 30224143895040.0, + "grad_norm": 1.875463548352243, + "language_loss": 0.70485508, + "learning_rate": 1.8350456023078292e-06, + "loss": 0.72653961, + "num_input_tokens_seen": 97036060, + "step": 4495, + "time_per_iteration": 3.3996193408966064 + }, + { + "auxiliary_loss_clip": 0.01180751, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.05342209, + "balance_loss_mlp": 1.02522516, + "epoch": 0.540612036313353, + "flos": 19938502615680.0, + "grad_norm": 2.780571036390765, + "language_loss": 0.78138185, + "learning_rate": 1.8342692971857874e-06, + "loss": 0.80352569, + "num_input_tokens_seen": 97055260, + "step": 4496, + "time_per_iteration": 2.4275875091552734 + }, + { + "auxiliary_loss_clip": 0.01147921, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.05072308, + "balance_loss_mlp": 1.02148688, + "epoch": 0.540732279203992, + "flos": 24279599692800.0, + "grad_norm": 2.1974390425763413, + "language_loss": 0.71137136, + "learning_rate": 1.833493017204962e-06, + "loss": 0.73314023, + "num_input_tokens_seen": 97075365, + "step": 4497, + "time_per_iteration": 2.558467149734497 + }, + { + "auxiliary_loss_clip": 0.01176532, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.05226088, + "balance_loss_mlp": 1.02559316, + "epoch": 0.5408525220946312, + "flos": 20193216935040.0, + "grad_norm": 1.8664053892363894, + "language_loss": 0.78216189, + "learning_rate": 1.8327167624831134e-06, + "loss": 0.80425823, + "num_input_tokens_seen": 97093095, + "step": 4498, + "time_per_iteration": 2.4242374897003174 + }, + { + "auxiliary_loss_clip": 0.01174873, + "auxiliary_loss_mlp": 0.01027145, + "balance_loss_clip": 1.05251908, + "balance_loss_mlp": 1.01994157, + "epoch": 0.5409727649852702, + "flos": 24134448833280.0, + "grad_norm": 1.5840676542609582, + "language_loss": 0.71054977, + "learning_rate": 1.831940533137999e-06, + "loss": 0.73256993, + "num_input_tokens_seen": 97112000, + "step": 4499, + "time_per_iteration": 3.3257219791412354 + }, + { + "auxiliary_loss_clip": 0.01161919, + "auxiliary_loss_mlp": 0.01026878, + "balance_loss_clip": 1.05461526, + "balance_loss_mlp": 1.01951051, + "epoch": 0.5410930078759093, + "flos": 23912700220800.0, + "grad_norm": 1.6962871266940485, + "language_loss": 0.72297829, + "learning_rate": 1.8311643292873718e-06, + "loss": 0.74486625, + "num_input_tokens_seen": 97130820, + "step": 4500, + "time_per_iteration": 2.4935522079467773 + }, + { + "auxiliary_loss_clip": 0.01158953, + "auxiliary_loss_mlp": 0.01027909, + "balance_loss_clip": 1.05154514, + "balance_loss_mlp": 1.02086663, + "epoch": 0.5412132507665485, + "flos": 21105132445440.0, + "grad_norm": 2.0953335547982097, + "language_loss": 0.87982357, + "learning_rate": 1.8303881510489818e-06, + "loss": 0.90169221, + "num_input_tokens_seen": 97149210, + "step": 4501, + "time_per_iteration": 3.255511522293091 + }, + { + "auxiliary_loss_clip": 0.01150309, + "auxiliary_loss_mlp": 0.01027774, + "balance_loss_clip": 1.05197513, + "balance_loss_mlp": 1.01922679, + "epoch": 0.5413334936571875, + "flos": 30227340205440.0, + "grad_norm": 2.464181921881129, + "language_loss": 0.69331312, + "learning_rate": 1.829611998540574e-06, + "loss": 0.71509397, + "num_input_tokens_seen": 97170415, + "step": 4502, + "time_per_iteration": 2.5475172996520996 + }, + { + "auxiliary_loss_clip": 0.01163828, + "auxiliary_loss_mlp": 0.0076218, + "balance_loss_clip": 1.05030084, + "balance_loss_mlp": 1.00030172, + "epoch": 0.5414537365478266, + "flos": 24279635606400.0, + "grad_norm": 2.1929211885137927, + "language_loss": 0.79975057, + "learning_rate": 1.8288358718798914e-06, + "loss": 0.81901073, + "num_input_tokens_seen": 97189605, + "step": 4503, + "time_per_iteration": 2.4653518199920654 + }, + { + "auxiliary_loss_clip": 0.01155891, + "auxiliary_loss_mlp": 0.00761946, + "balance_loss_clip": 1.04989314, + "balance_loss_mlp": 1.00035596, + "epoch": 0.5415739794384657, + "flos": 16654543735680.0, + "grad_norm": 1.7906156978222616, + "language_loss": 0.72438109, + "learning_rate": 1.8280597711846703e-06, + "loss": 0.74355948, + "num_input_tokens_seen": 97207845, + "step": 4504, + "time_per_iteration": 2.4137439727783203 + }, + { + "auxiliary_loss_clip": 0.01161612, + "auxiliary_loss_mlp": 0.01022528, + "balance_loss_clip": 1.05430222, + "balance_loss_mlp": 1.01501155, + "epoch": 0.5416942223291048, + "flos": 23185724860800.0, + "grad_norm": 1.7553121924385462, + "language_loss": 0.83786285, + "learning_rate": 1.8272836965726455e-06, + "loss": 0.8597042, + "num_input_tokens_seen": 97226780, + "step": 4505, + "time_per_iteration": 3.207233190536499 + }, + { + "auxiliary_loss_clip": 0.01104327, + "auxiliary_loss_mlp": 0.01027141, + "balance_loss_clip": 1.04199433, + "balance_loss_mlp": 1.01915944, + "epoch": 0.5418144652197439, + "flos": 20303247271680.0, + "grad_norm": 1.7384649819730214, + "language_loss": 0.78313279, + "learning_rate": 1.8265076481615461e-06, + "loss": 0.80444747, + "num_input_tokens_seen": 97246695, + "step": 4506, + "time_per_iteration": 2.5814051628112793 + }, + { + "auxiliary_loss_clip": 0.01147569, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.05069005, + "balance_loss_mlp": 1.02223718, + "epoch": 0.541934708110383, + "flos": 12458633431680.0, + "grad_norm": 2.161994067558727, + "language_loss": 0.87442434, + "learning_rate": 1.8257316260690987e-06, + "loss": 0.89620566, + "num_input_tokens_seen": 97264480, + "step": 4507, + "time_per_iteration": 2.5089099407196045 + }, + { + "auxiliary_loss_clip": 0.01161769, + "auxiliary_loss_mlp": 0.01017635, + "balance_loss_clip": 1.05010569, + "balance_loss_mlp": 1.01073325, + "epoch": 0.5420549510010221, + "flos": 21253802837760.0, + "grad_norm": 1.4342178975487165, + "language_loss": 0.75699228, + "learning_rate": 1.8249556304130254e-06, + "loss": 0.7787863, + "num_input_tokens_seen": 97285760, + "step": 4508, + "time_per_iteration": 2.461242198944092 + }, + { + "auxiliary_loss_clip": 0.01138354, + "auxiliary_loss_mlp": 0.01029662, + "balance_loss_clip": 1.04738355, + "balance_loss_mlp": 1.02181256, + "epoch": 0.5421751938916611, + "flos": 29490524519040.0, + "grad_norm": 1.9113721024560573, + "language_loss": 0.68916261, + "learning_rate": 1.824179661311044e-06, + "loss": 0.71084273, + "num_input_tokens_seen": 97304510, + "step": 4509, + "time_per_iteration": 2.5242104530334473 + }, + { + "auxiliary_loss_clip": 0.01116441, + "auxiliary_loss_mlp": 0.01024462, + "balance_loss_clip": 1.04052949, + "balance_loss_mlp": 1.01673722, + "epoch": 0.5422954367823003, + "flos": 18734238311040.0, + "grad_norm": 1.9054684177628225, + "language_loss": 0.79972827, + "learning_rate": 1.823403718880868e-06, + "loss": 0.82113731, + "num_input_tokens_seen": 97323270, + "step": 4510, + "time_per_iteration": 2.5528883934020996 + }, + { + "auxiliary_loss_clip": 0.01146336, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.04513681, + "balance_loss_mlp": 1.02189541, + "epoch": 0.5424156796729394, + "flos": 39969006940800.0, + "grad_norm": 1.584981514043345, + "language_loss": 0.66644168, + "learning_rate": 1.822627803240207e-06, + "loss": 0.68820179, + "num_input_tokens_seen": 97345600, + "step": 4511, + "time_per_iteration": 2.646287441253662 + }, + { + "auxiliary_loss_clip": 0.01136932, + "auxiliary_loss_mlp": 0.01029103, + "balance_loss_clip": 1.04874229, + "balance_loss_mlp": 1.02182531, + "epoch": 0.5425359225635784, + "flos": 11546538353280.0, + "grad_norm": 2.1479674754870053, + "language_loss": 0.84785694, + "learning_rate": 1.8218519145067675e-06, + "loss": 0.86951733, + "num_input_tokens_seen": 97361220, + "step": 4512, + "time_per_iteration": 2.4993834495544434 + }, + { + "auxiliary_loss_clip": 0.01126343, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.04443717, + "balance_loss_mlp": 1.02036405, + "epoch": 0.5426561654542175, + "flos": 20229702174720.0, + "grad_norm": 2.1856135700902257, + "language_loss": 0.89349431, + "learning_rate": 1.8210760527982508e-06, + "loss": 0.91503847, + "num_input_tokens_seen": 97381505, + "step": 4513, + "time_per_iteration": 2.519304037094116 + }, + { + "auxiliary_loss_clip": 0.01149375, + "auxiliary_loss_mlp": 0.00762314, + "balance_loss_clip": 1.05110645, + "balance_loss_mlp": 1.00032353, + "epoch": 0.5427764083448566, + "flos": 21871681614720.0, + "grad_norm": 1.7386745694831183, + "language_loss": 0.75077903, + "learning_rate": 1.8203002182323552e-06, + "loss": 0.76989591, + "num_input_tokens_seen": 97399060, + "step": 4514, + "time_per_iteration": 2.5139713287353516 + }, + { + "auxiliary_loss_clip": 0.01152836, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.05245519, + "balance_loss_mlp": 1.01754737, + "epoch": 0.5428966512354957, + "flos": 19640946349440.0, + "grad_norm": 1.9764111876447796, + "language_loss": 0.7580114, + "learning_rate": 1.819524410926773e-06, + "loss": 0.779796, + "num_input_tokens_seen": 97416740, + "step": 4515, + "time_per_iteration": 2.479501247406006 + }, + { + "auxiliary_loss_clip": 0.01103346, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.04651427, + "balance_loss_mlp": 1.01951098, + "epoch": 0.5430168941261347, + "flos": 22382187661440.0, + "grad_norm": 1.4678912406432258, + "language_loss": 0.77089119, + "learning_rate": 1.8187486309991944e-06, + "loss": 0.79219925, + "num_input_tokens_seen": 97437620, + "step": 4516, + "time_per_iteration": 2.607738733291626 + }, + { + "auxiliary_loss_clip": 0.01166328, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.0521723, + "balance_loss_mlp": 1.0227294, + "epoch": 0.5431371370167739, + "flos": 18764187275520.0, + "grad_norm": 1.649220096398923, + "language_loss": 0.7728231, + "learning_rate": 1.817972878567304e-06, + "loss": 0.7947849, + "num_input_tokens_seen": 97456275, + "step": 4517, + "time_per_iteration": 2.4406261444091797 + }, + { + "auxiliary_loss_clip": 0.01151926, + "auxiliary_loss_mlp": 0.01027294, + "balance_loss_clip": 1.04838395, + "balance_loss_mlp": 1.02008462, + "epoch": 0.543257379907413, + "flos": 18806023641600.0, + "grad_norm": 1.7383614917474701, + "language_loss": 0.76066166, + "learning_rate": 1.8171971537487834e-06, + "loss": 0.78245389, + "num_input_tokens_seen": 97474925, + "step": 4518, + "time_per_iteration": 2.49815034866333 + }, + { + "auxiliary_loss_clip": 0.01174344, + "auxiliary_loss_mlp": 0.01031816, + "balance_loss_clip": 1.05001843, + "balance_loss_mlp": 1.02399635, + "epoch": 0.543377622798052, + "flos": 17493381025920.0, + "grad_norm": 1.830820324531856, + "language_loss": 0.80420929, + "learning_rate": 1.8164214566613093e-06, + "loss": 0.82627088, + "num_input_tokens_seen": 97493550, + "step": 4519, + "time_per_iteration": 2.411087989807129 + }, + { + "auxiliary_loss_clip": 0.01172077, + "auxiliary_loss_mlp": 0.01023134, + "balance_loss_clip": 1.04954863, + "balance_loss_mlp": 1.01589847, + "epoch": 0.5434978656886912, + "flos": 18989311766400.0, + "grad_norm": 4.114227920719771, + "language_loss": 0.65828776, + "learning_rate": 1.8156457874225547e-06, + "loss": 0.68023992, + "num_input_tokens_seen": 97512010, + "step": 4520, + "time_per_iteration": 2.4073808193206787 + }, + { + "auxiliary_loss_clip": 0.01139216, + "auxiliary_loss_mlp": 0.01023683, + "balance_loss_clip": 1.04899454, + "balance_loss_mlp": 1.01642895, + "epoch": 0.5436181085793302, + "flos": 17274936464640.0, + "grad_norm": 2.035172728444253, + "language_loss": 0.80638015, + "learning_rate": 1.814870146150187e-06, + "loss": 0.82800913, + "num_input_tokens_seen": 97530120, + "step": 4521, + "time_per_iteration": 2.4470407962799072 + }, + { + "auxiliary_loss_clip": 0.01152884, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.04701042, + "balance_loss_mlp": 1.02381146, + "epoch": 0.5437383514699693, + "flos": 19098587917440.0, + "grad_norm": 1.91983004723685, + "language_loss": 0.78594959, + "learning_rate": 1.814094532961871e-06, + "loss": 0.80779231, + "num_input_tokens_seen": 97548695, + "step": 4522, + "time_per_iteration": 3.2551491260528564 + }, + { + "auxiliary_loss_clip": 0.01120395, + "auxiliary_loss_mlp": 0.01028786, + "balance_loss_clip": 1.04434252, + "balance_loss_mlp": 1.02104342, + "epoch": 0.5438585943606085, + "flos": 22602715211520.0, + "grad_norm": 1.846615871248621, + "language_loss": 0.83428198, + "learning_rate": 1.8133189479752666e-06, + "loss": 0.85577381, + "num_input_tokens_seen": 97567625, + "step": 4523, + "time_per_iteration": 2.554713249206543 + }, + { + "auxiliary_loss_clip": 0.01139024, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.04557085, + "balance_loss_mlp": 1.02073824, + "epoch": 0.5439788372512475, + "flos": 21798495653760.0, + "grad_norm": 1.8881763630042188, + "language_loss": 0.81947517, + "learning_rate": 1.8125433913080292e-06, + "loss": 0.84114206, + "num_input_tokens_seen": 97585325, + "step": 4524, + "time_per_iteration": 2.479999303817749 + }, + { + "auxiliary_loss_clip": 0.01089927, + "auxiliary_loss_mlp": 0.01025363, + "balance_loss_clip": 1.0459249, + "balance_loss_mlp": 1.01899123, + "epoch": 0.5440990801418866, + "flos": 16399362539520.0, + "grad_norm": 2.0509675596091728, + "language_loss": 0.82522714, + "learning_rate": 1.811767863077811e-06, + "loss": 0.84638011, + "num_input_tokens_seen": 97604275, + "step": 4525, + "time_per_iteration": 2.579888105392456 + }, + { + "auxiliary_loss_clip": 0.01097783, + "auxiliary_loss_mlp": 0.01027814, + "balance_loss_clip": 1.04573405, + "balance_loss_mlp": 1.02042603, + "epoch": 0.5442193230325257, + "flos": 21615638492160.0, + "grad_norm": 1.6406692432042242, + "language_loss": 0.78299081, + "learning_rate": 1.8109923634022577e-06, + "loss": 0.80424678, + "num_input_tokens_seen": 97624300, + "step": 4526, + "time_per_iteration": 3.4092373847961426 + }, + { + "auxiliary_loss_clip": 0.01177091, + "auxiliary_loss_mlp": 0.01025861, + "balance_loss_clip": 1.05138004, + "balance_loss_mlp": 1.0180887, + "epoch": 0.5443395659231648, + "flos": 15481198062720.0, + "grad_norm": 1.9315214886879715, + "language_loss": 0.86531496, + "learning_rate": 1.8102168923990128e-06, + "loss": 0.88734448, + "num_input_tokens_seen": 97637845, + "step": 4527, + "time_per_iteration": 2.3992555141448975 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.00761583, + "balance_loss_clip": 1.05297613, + "balance_loss_mlp": 1.00039887, + "epoch": 0.5444598088138038, + "flos": 18770436241920.0, + "grad_norm": 1.792593614094289, + "language_loss": 0.79970706, + "learning_rate": 1.809441450185714e-06, + "loss": 0.81897336, + "num_input_tokens_seen": 97656330, + "step": 4528, + "time_per_iteration": 3.2790322303771973 + }, + { + "auxiliary_loss_clip": 0.011512, + "auxiliary_loss_mlp": 0.01024985, + "balance_loss_clip": 1.04636526, + "balance_loss_mlp": 1.01722479, + "epoch": 0.544580051704443, + "flos": 21142335957120.0, + "grad_norm": 2.0519548152882843, + "language_loss": 0.73393506, + "learning_rate": 1.8086660368799958e-06, + "loss": 0.75569689, + "num_input_tokens_seen": 97674380, + "step": 4529, + "time_per_iteration": 2.4725565910339355 + }, + { + "auxiliary_loss_clip": 0.01150644, + "auxiliary_loss_mlp": 0.01020466, + "balance_loss_clip": 1.05085957, + "balance_loss_mlp": 1.01263428, + "epoch": 0.5447002945950821, + "flos": 32491508054400.0, + "grad_norm": 1.7249744067724055, + "language_loss": 0.77068698, + "learning_rate": 1.807890652599488e-06, + "loss": 0.7923981, + "num_input_tokens_seen": 97698765, + "step": 4530, + "time_per_iteration": 2.585057258605957 + }, + { + "auxiliary_loss_clip": 0.01172829, + "auxiliary_loss_mlp": 0.0102648, + "balance_loss_clip": 1.05118966, + "balance_loss_mlp": 1.01974797, + "epoch": 0.5448205374857211, + "flos": 11798307757440.0, + "grad_norm": 4.200980973443889, + "language_loss": 0.82388544, + "learning_rate": 1.8071152974618156e-06, + "loss": 0.84587854, + "num_input_tokens_seen": 97716565, + "step": 4531, + "time_per_iteration": 3.148620128631592 + }, + { + "auxiliary_loss_clip": 0.01134952, + "auxiliary_loss_mlp": 0.00761862, + "balance_loss_clip": 1.04592037, + "balance_loss_mlp": 1.00027609, + "epoch": 0.5449407803763603, + "flos": 24133766474880.0, + "grad_norm": 2.3330085471769504, + "language_loss": 0.78289199, + "learning_rate": 1.806339971584599e-06, + "loss": 0.80186015, + "num_input_tokens_seen": 97733225, + "step": 4532, + "time_per_iteration": 2.5186805725097656 + }, + { + "auxiliary_loss_clip": 0.01174376, + "auxiliary_loss_mlp": 0.01025538, + "balance_loss_clip": 1.05073059, + "balance_loss_mlp": 1.01780176, + "epoch": 0.5450610232669993, + "flos": 23258551685760.0, + "grad_norm": 2.000358025561131, + "language_loss": 0.8523401, + "learning_rate": 1.8055646750854546e-06, + "loss": 0.87433922, + "num_input_tokens_seen": 97752735, + "step": 4533, + "time_per_iteration": 2.430197238922119 + }, + { + "auxiliary_loss_clip": 0.01149936, + "auxiliary_loss_mlp": 0.01023404, + "balance_loss_clip": 1.04878426, + "balance_loss_mlp": 1.01578069, + "epoch": 0.5451812661576384, + "flos": 17785083375360.0, + "grad_norm": 2.186783014702608, + "language_loss": 0.81599176, + "learning_rate": 1.8047894080819945e-06, + "loss": 0.83772516, + "num_input_tokens_seen": 97769985, + "step": 4534, + "time_per_iteration": 2.4424550533294678 + }, + { + "auxiliary_loss_clip": 0.01077564, + "auxiliary_loss_mlp": 0.01000203, + "balance_loss_clip": 1.01713634, + "balance_loss_mlp": 0.99932688, + "epoch": 0.5453015090482776, + "flos": 71062586513280.0, + "grad_norm": 0.7222235935203467, + "language_loss": 0.63152587, + "learning_rate": 1.8040141706918258e-06, + "loss": 0.65230346, + "num_input_tokens_seen": 97831225, + "step": 4535, + "time_per_iteration": 3.1208040714263916 + }, + { + "auxiliary_loss_clip": 0.01148813, + "auxiliary_loss_mlp": 0.0102608, + "balance_loss_clip": 1.05027533, + "balance_loss_mlp": 1.01842642, + "epoch": 0.5454217519389166, + "flos": 25552201622400.0, + "grad_norm": 1.7454166229967965, + "language_loss": 0.76542628, + "learning_rate": 1.8032389630325525e-06, + "loss": 0.78717518, + "num_input_tokens_seen": 97849975, + "step": 4536, + "time_per_iteration": 2.5153892040252686 + }, + { + "auxiliary_loss_clip": 0.01144809, + "auxiliary_loss_mlp": 0.01029455, + "balance_loss_clip": 1.0441432, + "balance_loss_mlp": 1.02153361, + "epoch": 0.5455419948295557, + "flos": 23658345037440.0, + "grad_norm": 1.6372953379951058, + "language_loss": 0.75886774, + "learning_rate": 1.8024637852217707e-06, + "loss": 0.78061038, + "num_input_tokens_seen": 97869700, + "step": 4537, + "time_per_iteration": 2.5058796405792236 + }, + { + "auxiliary_loss_clip": 0.01146537, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.04838705, + "balance_loss_mlp": 1.02599454, + "epoch": 0.5456622377201948, + "flos": 23403989854080.0, + "grad_norm": 1.8888518967027725, + "language_loss": 0.84811908, + "learning_rate": 1.8016886373770766e-06, + "loss": 0.86992419, + "num_input_tokens_seen": 97888215, + "step": 4538, + "time_per_iteration": 2.4896507263183594 + }, + { + "auxiliary_loss_clip": 0.01147538, + "auxiliary_loss_mlp": 0.0102518, + "balance_loss_clip": 1.04890561, + "balance_loss_mlp": 1.01766419, + "epoch": 0.5457824806108339, + "flos": 23988040997760.0, + "grad_norm": 1.70619246750523, + "language_loss": 0.78872037, + "learning_rate": 1.8009135196160579e-06, + "loss": 0.81044763, + "num_input_tokens_seen": 97907090, + "step": 4539, + "time_per_iteration": 2.5665969848632812 + }, + { + "auxiliary_loss_clip": 0.01129304, + "auxiliary_loss_mlp": 0.01027329, + "balance_loss_clip": 1.04558575, + "balance_loss_mlp": 1.02077579, + "epoch": 0.545902723501473, + "flos": 22565870835840.0, + "grad_norm": 1.6616023518629373, + "language_loss": 0.84290826, + "learning_rate": 1.8001384320563e-06, + "loss": 0.86447465, + "num_input_tokens_seen": 97927345, + "step": 4540, + "time_per_iteration": 2.5505895614624023 + }, + { + "auxiliary_loss_clip": 0.01076303, + "auxiliary_loss_mlp": 0.01000217, + "balance_loss_clip": 1.01617897, + "balance_loss_mlp": 0.99933493, + "epoch": 0.5460229663921121, + "flos": 55198399685760.0, + "grad_norm": 0.7749301543654749, + "language_loss": 0.57799071, + "learning_rate": 1.7993633748153833e-06, + "loss": 0.5987559, + "num_input_tokens_seen": 97981950, + "step": 4541, + "time_per_iteration": 2.912912368774414 + }, + { + "auxiliary_loss_clip": 0.01165002, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.05021369, + "balance_loss_mlp": 1.02117836, + "epoch": 0.5461432092827512, + "flos": 15413866018560.0, + "grad_norm": 2.0416428692083866, + "language_loss": 0.73028022, + "learning_rate": 1.7985883480108834e-06, + "loss": 0.75221699, + "num_input_tokens_seen": 97999585, + "step": 4542, + "time_per_iteration": 2.424006700515747 + }, + { + "auxiliary_loss_clip": 0.01156192, + "auxiliary_loss_mlp": 0.01030804, + "balance_loss_clip": 1.04839957, + "balance_loss_mlp": 1.02288282, + "epoch": 0.5462634521733902, + "flos": 24024921287040.0, + "grad_norm": 1.7501600149856733, + "language_loss": 0.71994466, + "learning_rate": 1.797813351760371e-06, + "loss": 0.74181461, + "num_input_tokens_seen": 98021290, + "step": 4543, + "time_per_iteration": 2.481827735900879 + }, + { + "auxiliary_loss_clip": 0.01176667, + "auxiliary_loss_mlp": 0.0102296, + "balance_loss_clip": 1.052508, + "balance_loss_mlp": 1.01519907, + "epoch": 0.5463836950640293, + "flos": 22820944291200.0, + "grad_norm": 1.836872192055815, + "language_loss": 0.77999276, + "learning_rate": 1.7970383861814116e-06, + "loss": 0.80198902, + "num_input_tokens_seen": 98041060, + "step": 4544, + "time_per_iteration": 2.439911127090454 + }, + { + "auxiliary_loss_clip": 0.01160459, + "auxiliary_loss_mlp": 0.01025248, + "balance_loss_clip": 1.051036, + "balance_loss_mlp": 1.01744556, + "epoch": 0.5465039379546685, + "flos": 20448290390400.0, + "grad_norm": 1.8993314336524234, + "language_loss": 0.74120528, + "learning_rate": 1.7962634513915684e-06, + "loss": 0.76306236, + "num_input_tokens_seen": 98058410, + "step": 4545, + "time_per_iteration": 2.43542742729187 + }, + { + "auxiliary_loss_clip": 0.01173234, + "auxiliary_loss_mlp": 0.01021611, + "balance_loss_clip": 1.0507493, + "balance_loss_mlp": 1.01440763, + "epoch": 0.5466241808453075, + "flos": 17343310003200.0, + "grad_norm": 1.8499934561082187, + "language_loss": 0.79394221, + "learning_rate": 1.7954885475083969e-06, + "loss": 0.81589067, + "num_input_tokens_seen": 98076080, + "step": 4546, + "time_per_iteration": 2.386650800704956 + }, + { + "auxiliary_loss_clip": 0.01177076, + "auxiliary_loss_mlp": 0.01030274, + "balance_loss_clip": 1.0522691, + "balance_loss_mlp": 1.02276933, + "epoch": 0.5467444237359466, + "flos": 21617039122560.0, + "grad_norm": 2.683068920328305, + "language_loss": 0.7231003, + "learning_rate": 1.7947136746494513e-06, + "loss": 0.74517381, + "num_input_tokens_seen": 98096995, + "step": 4547, + "time_per_iteration": 2.4507923126220703 + }, + { + "auxiliary_loss_clip": 0.01160442, + "auxiliary_loss_mlp": 0.01028466, + "balance_loss_clip": 1.05037403, + "balance_loss_mlp": 1.02111673, + "epoch": 0.5468646666265857, + "flos": 24170467196160.0, + "grad_norm": 2.0244713422737535, + "language_loss": 0.87999213, + "learning_rate": 1.793938832932277e-06, + "loss": 0.90188122, + "num_input_tokens_seen": 98115105, + "step": 4548, + "time_per_iteration": 2.543168544769287 + }, + { + "auxiliary_loss_clip": 0.01175335, + "auxiliary_loss_mlp": 0.01022658, + "balance_loss_clip": 1.0509218, + "balance_loss_mlp": 1.01533258, + "epoch": 0.5469849095172248, + "flos": 27527001505920.0, + "grad_norm": 3.9813891389050253, + "language_loss": 0.70256078, + "learning_rate": 1.7931640224744185e-06, + "loss": 0.72454071, + "num_input_tokens_seen": 98135655, + "step": 4549, + "time_per_iteration": 3.3618717193603516 + }, + { + "auxiliary_loss_clip": 0.0111812, + "auxiliary_loss_mlp": 0.01024883, + "balance_loss_clip": 1.04001212, + "balance_loss_mlp": 1.01751029, + "epoch": 0.5471051524078638, + "flos": 27964680727680.0, + "grad_norm": 1.6906578686327298, + "language_loss": 0.73609936, + "learning_rate": 1.7923892433934127e-06, + "loss": 0.75752944, + "num_input_tokens_seen": 98156730, + "step": 4550, + "time_per_iteration": 2.6062722206115723 + }, + { + "auxiliary_loss_clip": 0.01149645, + "auxiliary_loss_mlp": 0.00762469, + "balance_loss_clip": 1.04947627, + "balance_loss_mlp": 1.00027156, + "epoch": 0.547225395298503, + "flos": 18150510389760.0, + "grad_norm": 1.7988293326444136, + "language_loss": 0.78776646, + "learning_rate": 1.7916144958067939e-06, + "loss": 0.80688763, + "num_input_tokens_seen": 98174590, + "step": 4551, + "time_per_iteration": 2.458024263381958 + }, + { + "auxiliary_loss_clip": 0.01162338, + "auxiliary_loss_mlp": 0.0102254, + "balance_loss_clip": 1.04953206, + "balance_loss_mlp": 1.01510155, + "epoch": 0.5473456381891421, + "flos": 21361498790400.0, + "grad_norm": 1.6765500906810593, + "language_loss": 0.78673708, + "learning_rate": 1.7908397798320905e-06, + "loss": 0.80858582, + "num_input_tokens_seen": 98194325, + "step": 4552, + "time_per_iteration": 2.4699718952178955 + }, + { + "auxiliary_loss_clip": 0.01160948, + "auxiliary_loss_mlp": 0.00762508, + "balance_loss_clip": 1.04991162, + "balance_loss_mlp": 1.00033486, + "epoch": 0.5474658810797811, + "flos": 19932145908480.0, + "grad_norm": 1.8051822545327347, + "language_loss": 0.74956799, + "learning_rate": 1.7900650955868265e-06, + "loss": 0.76880252, + "num_input_tokens_seen": 98213970, + "step": 4553, + "time_per_iteration": 3.324789524078369 + }, + { + "auxiliary_loss_clip": 0.01160909, + "auxiliary_loss_mlp": 0.0076161, + "balance_loss_clip": 1.05158138, + "balance_loss_mlp": 1.00024807, + "epoch": 0.5475861239704203, + "flos": 50476217264640.0, + "grad_norm": 1.6981592132020786, + "language_loss": 0.76457, + "learning_rate": 1.7892904431885202e-06, + "loss": 0.78379524, + "num_input_tokens_seen": 98241145, + "step": 4554, + "time_per_iteration": 2.726081371307373 + }, + { + "auxiliary_loss_clip": 0.01118066, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.04255903, + "balance_loss_mlp": 1.02022946, + "epoch": 0.5477063668610593, + "flos": 20705123612160.0, + "grad_norm": 2.6497862896537736, + "language_loss": 0.75188208, + "learning_rate": 1.788515822754686e-06, + "loss": 0.77333462, + "num_input_tokens_seen": 98261565, + "step": 4555, + "time_per_iteration": 3.524815082550049 + }, + { + "auxiliary_loss_clip": 0.01133472, + "auxiliary_loss_mlp": 0.01029459, + "balance_loss_clip": 1.04474831, + "balance_loss_mlp": 1.02174652, + "epoch": 0.5478266097516984, + "flos": 19609740408960.0, + "grad_norm": 2.713399239387335, + "language_loss": 0.78099608, + "learning_rate": 1.7877412344028335e-06, + "loss": 0.80262542, + "num_input_tokens_seen": 98281370, + "step": 4556, + "time_per_iteration": 2.532320737838745 + }, + { + "auxiliary_loss_clip": 0.01162166, + "auxiliary_loss_mlp": 0.01021516, + "balance_loss_clip": 1.04914927, + "balance_loss_mlp": 1.0140655, + "epoch": 0.5479468526423376, + "flos": 12896599962240.0, + "grad_norm": 2.202349931979879, + "language_loss": 0.77485198, + "learning_rate": 1.7869666782504668e-06, + "loss": 0.7966888, + "num_input_tokens_seen": 98297950, + "step": 4557, + "time_per_iteration": 2.463698387145996 + }, + { + "auxiliary_loss_clip": 0.01132633, + "auxiliary_loss_mlp": 0.01024082, + "balance_loss_clip": 1.04283547, + "balance_loss_mlp": 1.01650059, + "epoch": 0.5480670955329766, + "flos": 18588800142720.0, + "grad_norm": 2.424999626467018, + "language_loss": 0.68952286, + "learning_rate": 1.7861921544150867e-06, + "loss": 0.71109009, + "num_input_tokens_seen": 98316800, + "step": 4558, + "time_per_iteration": 3.2716052532196045 + }, + { + "auxiliary_loss_clip": 0.01091073, + "auxiliary_loss_mlp": 0.00761889, + "balance_loss_clip": 1.04247427, + "balance_loss_mlp": 1.00020099, + "epoch": 0.5481873384236157, + "flos": 15954608338560.0, + "grad_norm": 1.8777549783472427, + "language_loss": 0.76459134, + "learning_rate": 1.7854176630141856e-06, + "loss": 0.78312099, + "num_input_tokens_seen": 98333935, + "step": 4559, + "time_per_iteration": 2.6099140644073486 + }, + { + "auxiliary_loss_clip": 0.01179481, + "auxiliary_loss_mlp": 0.01036978, + "balance_loss_clip": 1.05334163, + "balance_loss_mlp": 1.02925944, + "epoch": 0.5483075813142548, + "flos": 22783812606720.0, + "grad_norm": 2.117852096102336, + "language_loss": 0.84697604, + "learning_rate": 1.784643204165255e-06, + "loss": 0.86914062, + "num_input_tokens_seen": 98353255, + "step": 4560, + "time_per_iteration": 2.4737913608551025 + }, + { + "auxiliary_loss_clip": 0.01155709, + "auxiliary_loss_mlp": 0.01024309, + "balance_loss_clip": 1.05070949, + "balance_loss_mlp": 1.01701951, + "epoch": 0.5484278242048939, + "flos": 19317212046720.0, + "grad_norm": 1.929608792232688, + "language_loss": 0.77659905, + "learning_rate": 1.7838687779857783e-06, + "loss": 0.79839921, + "num_input_tokens_seen": 98371130, + "step": 4561, + "time_per_iteration": 2.4686620235443115 + }, + { + "auxiliary_loss_clip": 0.01138915, + "auxiliary_loss_mlp": 0.01026644, + "balance_loss_clip": 1.04524457, + "balance_loss_mlp": 1.01859772, + "epoch": 0.5485480670955329, + "flos": 22816024128000.0, + "grad_norm": 2.525157956433148, + "language_loss": 0.63625896, + "learning_rate": 1.7830943845932366e-06, + "loss": 0.65791452, + "num_input_tokens_seen": 98390455, + "step": 4562, + "time_per_iteration": 2.511744976043701 + }, + { + "auxiliary_loss_clip": 0.01148393, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.04811239, + "balance_loss_mlp": 1.02007735, + "epoch": 0.5486683099861721, + "flos": 22671304231680.0, + "grad_norm": 1.7376921800225773, + "language_loss": 0.7511009, + "learning_rate": 1.7823200241051044e-06, + "loss": 0.77285671, + "num_input_tokens_seen": 98409370, + "step": 4563, + "time_per_iteration": 2.5301826000213623 + }, + { + "auxiliary_loss_clip": 0.01174728, + "auxiliary_loss_mlp": 0.01022712, + "balance_loss_clip": 1.0509963, + "balance_loss_mlp": 1.01518416, + "epoch": 0.5487885528768112, + "flos": 23149383275520.0, + "grad_norm": 1.8530807796113584, + "language_loss": 0.80451554, + "learning_rate": 1.7815456966388513e-06, + "loss": 0.82648993, + "num_input_tokens_seen": 98428465, + "step": 4564, + "time_per_iteration": 2.4201343059539795 + }, + { + "auxiliary_loss_clip": 0.01133188, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.04530263, + "balance_loss_mlp": 1.02349341, + "epoch": 0.5489087957674502, + "flos": 22053928245120.0, + "grad_norm": 2.242278662268403, + "language_loss": 0.80838966, + "learning_rate": 1.780771402311943e-06, + "loss": 0.8300308, + "num_input_tokens_seen": 98447300, + "step": 4565, + "time_per_iteration": 2.5529279708862305 + }, + { + "auxiliary_loss_clip": 0.01145576, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.04783845, + "balance_loss_mlp": 1.02119112, + "epoch": 0.5490290386580894, + "flos": 24315977191680.0, + "grad_norm": 1.9985236507139452, + "language_loss": 0.78857285, + "learning_rate": 1.7799971412418374e-06, + "loss": 0.81031871, + "num_input_tokens_seen": 98468695, + "step": 4566, + "time_per_iteration": 2.515193462371826 + }, + { + "auxiliary_loss_clip": 0.01135284, + "auxiliary_loss_mlp": 0.01025986, + "balance_loss_clip": 1.04826117, + "balance_loss_mlp": 1.01838374, + "epoch": 0.5491492815487284, + "flos": 18294942977280.0, + "grad_norm": 11.723893213270623, + "language_loss": 0.73885345, + "learning_rate": 1.7792229135459918e-06, + "loss": 0.76046616, + "num_input_tokens_seen": 98485345, + "step": 4567, + "time_per_iteration": 2.4846317768096924 + }, + { + "auxiliary_loss_clip": 0.01045923, + "auxiliary_loss_mlp": 0.01020286, + "balance_loss_clip": 1.03527236, + "balance_loss_mlp": 1.01878965, + "epoch": 0.5492695244393675, + "flos": 64550257050240.0, + "grad_norm": 0.7975063242764259, + "language_loss": 0.61580998, + "learning_rate": 1.7784487193418538e-06, + "loss": 0.63647211, + "num_input_tokens_seen": 98543195, + "step": 4568, + "time_per_iteration": 3.056511878967285 + }, + { + "auxiliary_loss_clip": 0.01117544, + "auxiliary_loss_mlp": 0.01025708, + "balance_loss_clip": 1.04118085, + "balance_loss_mlp": 1.01760793, + "epoch": 0.5493897673300067, + "flos": 17379579761280.0, + "grad_norm": 1.977251591286577, + "language_loss": 0.61347437, + "learning_rate": 1.7776745587468698e-06, + "loss": 0.63490689, + "num_input_tokens_seen": 98560620, + "step": 4569, + "time_per_iteration": 2.5322492122650146 + }, + { + "auxiliary_loss_clip": 0.01172484, + "auxiliary_loss_mlp": 0.01029719, + "balance_loss_clip": 1.04877591, + "balance_loss_mlp": 1.02223301, + "epoch": 0.5495100102206457, + "flos": 19901765980800.0, + "grad_norm": 2.280690154085733, + "language_loss": 0.81421161, + "learning_rate": 1.7769004318784776e-06, + "loss": 0.83623374, + "num_input_tokens_seen": 98578265, + "step": 4570, + "time_per_iteration": 2.4282124042510986 + }, + { + "auxiliary_loss_clip": 0.01162038, + "auxiliary_loss_mlp": 0.01022539, + "balance_loss_clip": 1.04976487, + "balance_loss_mlp": 1.01530933, + "epoch": 0.5496302531112848, + "flos": 16727190992640.0, + "grad_norm": 2.3230471837682507, + "language_loss": 0.80471182, + "learning_rate": 1.776126338854113e-06, + "loss": 0.82655752, + "num_input_tokens_seen": 98596055, + "step": 4571, + "time_per_iteration": 2.436657667160034 + }, + { + "auxiliary_loss_clip": 0.01158951, + "auxiliary_loss_mlp": 0.01026041, + "balance_loss_clip": 1.05287945, + "balance_loss_mlp": 1.01882625, + "epoch": 0.5497504960019239, + "flos": 24572343536640.0, + "grad_norm": 1.6746441438583883, + "language_loss": 0.84460211, + "learning_rate": 1.7753522797912044e-06, + "loss": 0.86645204, + "num_input_tokens_seen": 98616140, + "step": 4572, + "time_per_iteration": 2.495299816131592 + }, + { + "auxiliary_loss_clip": 0.01152074, + "auxiliary_loss_mlp": 0.01024062, + "balance_loss_clip": 1.04634762, + "balance_loss_mlp": 1.01652193, + "epoch": 0.549870738892563, + "flos": 15450494912640.0, + "grad_norm": 2.2534041290342057, + "language_loss": 0.69912559, + "learning_rate": 1.7745782548071765e-06, + "loss": 0.72088695, + "num_input_tokens_seen": 98633035, + "step": 4573, + "time_per_iteration": 2.463361978530884 + }, + { + "auxiliary_loss_clip": 0.01131782, + "auxiliary_loss_mlp": 0.01031766, + "balance_loss_clip": 1.0534482, + "balance_loss_mlp": 1.02428865, + "epoch": 0.549990981783202, + "flos": 21069114082560.0, + "grad_norm": 1.6027613517811543, + "language_loss": 0.74147856, + "learning_rate": 1.7738042640194482e-06, + "loss": 0.76311398, + "num_input_tokens_seen": 98652700, + "step": 4574, + "time_per_iteration": 2.513716220855713 + }, + { + "auxiliary_loss_clip": 0.01174342, + "auxiliary_loss_mlp": 0.0102484, + "balance_loss_clip": 1.05057776, + "balance_loss_mlp": 1.01723456, + "epoch": 0.5501112246738411, + "flos": 21395901041280.0, + "grad_norm": 4.283970291102364, + "language_loss": 0.70605361, + "learning_rate": 1.7730303075454335e-06, + "loss": 0.72804546, + "num_input_tokens_seen": 98671590, + "step": 4575, + "time_per_iteration": 2.417705535888672 + }, + { + "auxiliary_loss_clip": 0.0113337, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.04462266, + "balance_loss_mlp": 1.01873326, + "epoch": 0.5502314675644803, + "flos": 17456931699840.0, + "grad_norm": 1.8406541595085508, + "language_loss": 0.85018128, + "learning_rate": 1.7722563855025402e-06, + "loss": 0.87178397, + "num_input_tokens_seen": 98689620, + "step": 4576, + "time_per_iteration": 3.239882707595825 + }, + { + "auxiliary_loss_clip": 0.01146196, + "auxiliary_loss_mlp": 0.01020372, + "balance_loss_clip": 1.0441941, + "balance_loss_mlp": 1.01271927, + "epoch": 0.5503517104551193, + "flos": 24310410583680.0, + "grad_norm": 2.5152415230034992, + "language_loss": 0.70804691, + "learning_rate": 1.7714824980081721e-06, + "loss": 0.72971261, + "num_input_tokens_seen": 98708915, + "step": 4577, + "time_per_iteration": 2.530142068862915 + }, + { + "auxiliary_loss_clip": 0.01158352, + "auxiliary_loss_mlp": 0.01021886, + "balance_loss_clip": 1.05130887, + "balance_loss_mlp": 1.01495147, + "epoch": 0.5504719533457584, + "flos": 22419427086720.0, + "grad_norm": 1.6876752239705388, + "language_loss": 0.73926985, + "learning_rate": 1.7707086451797276e-06, + "loss": 0.76107216, + "num_input_tokens_seen": 98729790, + "step": 4578, + "time_per_iteration": 2.4867489337921143 + }, + { + "auxiliary_loss_clip": 0.01041873, + "auxiliary_loss_mlp": 0.0100182, + "balance_loss_clip": 1.01362205, + "balance_loss_mlp": 1.00084198, + "epoch": 0.5505921962363975, + "flos": 67294155968640.0, + "grad_norm": 0.6992956562704757, + "language_loss": 0.52346671, + "learning_rate": 1.7699348271345993e-06, + "loss": 0.54390365, + "num_input_tokens_seen": 98792415, + "step": 4579, + "time_per_iteration": 3.89481520652771 + }, + { + "auxiliary_loss_clip": 0.01037737, + "auxiliary_loss_mlp": 0.01003438, + "balance_loss_clip": 1.01361394, + "balance_loss_mlp": 1.00232971, + "epoch": 0.5507124391270366, + "flos": 45685125578880.0, + "grad_norm": 3.1794472675558203, + "language_loss": 0.54430765, + "learning_rate": 1.7691610439901753e-06, + "loss": 0.56471938, + "num_input_tokens_seen": 98855350, + "step": 4580, + "time_per_iteration": 3.1726150512695312 + }, + { + "auxiliary_loss_clip": 0.01163625, + "auxiliary_loss_mlp": 0.01026128, + "balance_loss_clip": 1.05114615, + "balance_loss_mlp": 1.01914787, + "epoch": 0.5508326820176757, + "flos": 22273845264000.0, + "grad_norm": 1.795442319083522, + "language_loss": 0.75315332, + "learning_rate": 1.7683872958638367e-06, + "loss": 0.77505088, + "num_input_tokens_seen": 98874230, + "step": 4581, + "time_per_iteration": 2.5027122497558594 + }, + { + "auxiliary_loss_clip": 0.01142542, + "auxiliary_loss_mlp": 0.01025634, + "balance_loss_clip": 1.04526246, + "balance_loss_mlp": 1.01792121, + "epoch": 0.5509529249083148, + "flos": 20012442762240.0, + "grad_norm": 2.3341089321278923, + "language_loss": 0.84708309, + "learning_rate": 1.7676135828729614e-06, + "loss": 0.86876488, + "num_input_tokens_seen": 98893940, + "step": 4582, + "time_per_iteration": 3.3754777908325195 + }, + { + "auxiliary_loss_clip": 0.0116154, + "auxiliary_loss_mlp": 0.01026191, + "balance_loss_clip": 1.05121076, + "balance_loss_mlp": 1.01863003, + "epoch": 0.5510731677989539, + "flos": 21834801325440.0, + "grad_norm": 2.546083563910847, + "language_loss": 0.82716769, + "learning_rate": 1.7668399051349205e-06, + "loss": 0.84904498, + "num_input_tokens_seen": 98913620, + "step": 4583, + "time_per_iteration": 2.481414794921875 + }, + { + "auxiliary_loss_clip": 0.01129364, + "auxiliary_loss_mlp": 0.01021433, + "balance_loss_clip": 1.04554474, + "balance_loss_mlp": 1.01402414, + "epoch": 0.5511934106895929, + "flos": 21467901853440.0, + "grad_norm": 1.9774319854423994, + "language_loss": 0.83366269, + "learning_rate": 1.766066262767081e-06, + "loss": 0.85517067, + "num_input_tokens_seen": 98931460, + "step": 4584, + "time_per_iteration": 2.5133705139160156 + }, + { + "auxiliary_loss_clip": 0.01143608, + "auxiliary_loss_mlp": 0.01023765, + "balance_loss_clip": 1.05011714, + "balance_loss_mlp": 1.01636803, + "epoch": 0.5513136535802321, + "flos": 21068934514560.0, + "grad_norm": 2.11975912492017, + "language_loss": 0.77296448, + "learning_rate": 1.765292655886803e-06, + "loss": 0.79463822, + "num_input_tokens_seen": 98950105, + "step": 4585, + "time_per_iteration": 3.266927719116211 + }, + { + "auxiliary_loss_clip": 0.01137984, + "auxiliary_loss_mlp": 0.01027223, + "balance_loss_clip": 1.04614711, + "balance_loss_mlp": 1.01968932, + "epoch": 0.5514338964708712, + "flos": 27815004754560.0, + "grad_norm": 2.316459250751171, + "language_loss": 0.70491791, + "learning_rate": 1.764519084611443e-06, + "loss": 0.72657001, + "num_input_tokens_seen": 98970560, + "step": 4586, + "time_per_iteration": 2.554762840270996 + }, + { + "auxiliary_loss_clip": 0.01144834, + "auxiliary_loss_mlp": 0.01025652, + "balance_loss_clip": 1.04485667, + "balance_loss_mlp": 1.01716995, + "epoch": 0.5515541393615102, + "flos": 21908525990400.0, + "grad_norm": 1.9169869126700863, + "language_loss": 0.77876776, + "learning_rate": 1.7637455490583505e-06, + "loss": 0.80047262, + "num_input_tokens_seen": 98989885, + "step": 4587, + "time_per_iteration": 2.5103631019592285 + }, + { + "auxiliary_loss_clip": 0.01160474, + "auxiliary_loss_mlp": 0.0102577, + "balance_loss_clip": 1.05094504, + "balance_loss_mlp": 1.01883531, + "epoch": 0.5516743822521494, + "flos": 20485422074880.0, + "grad_norm": 2.1088689234778375, + "language_loss": 0.77264249, + "learning_rate": 1.7629720493448701e-06, + "loss": 0.794505, + "num_input_tokens_seen": 99007180, + "step": 4588, + "time_per_iteration": 2.4755096435546875 + }, + { + "auxiliary_loss_clip": 0.01155645, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.04966128, + "balance_loss_mlp": 1.02235818, + "epoch": 0.5517946251427884, + "flos": 14940383915520.0, + "grad_norm": 2.221206655272309, + "language_loss": 0.85483432, + "learning_rate": 1.7621985855883418e-06, + "loss": 0.87668824, + "num_input_tokens_seen": 99023880, + "step": 4589, + "time_per_iteration": 2.4562203884124756 + }, + { + "auxiliary_loss_clip": 0.01141417, + "auxiliary_loss_mlp": 0.01022181, + "balance_loss_clip": 1.0477711, + "balance_loss_mlp": 1.0145998, + "epoch": 0.5519148680334275, + "flos": 18404865573120.0, + "grad_norm": 1.9633910655858247, + "language_loss": 0.72875792, + "learning_rate": 1.7614251579060983e-06, + "loss": 0.75039393, + "num_input_tokens_seen": 99042475, + "step": 4590, + "time_per_iteration": 2.4668638706207275 + }, + { + "auxiliary_loss_clip": 0.01135538, + "auxiliary_loss_mlp": 0.01025429, + "balance_loss_clip": 1.04711854, + "balance_loss_mlp": 1.01762688, + "epoch": 0.5520351109240667, + "flos": 25113337251840.0, + "grad_norm": 1.7528165984326978, + "language_loss": 0.84657156, + "learning_rate": 1.76065176641547e-06, + "loss": 0.86818123, + "num_input_tokens_seen": 99065185, + "step": 4591, + "time_per_iteration": 2.5800955295562744 + }, + { + "auxiliary_loss_clip": 0.01159643, + "auxiliary_loss_mlp": 0.01022772, + "balance_loss_clip": 1.04646087, + "balance_loss_mlp": 1.01492786, + "epoch": 0.5521553538147057, + "flos": 21069545045760.0, + "grad_norm": 1.8201871631045887, + "language_loss": 0.78194022, + "learning_rate": 1.759878411233777e-06, + "loss": 0.80376434, + "num_input_tokens_seen": 99083645, + "step": 4592, + "time_per_iteration": 2.4513750076293945 + }, + { + "auxiliary_loss_clip": 0.01158278, + "auxiliary_loss_mlp": 0.01023502, + "balance_loss_clip": 1.04787374, + "balance_loss_mlp": 1.0156281, + "epoch": 0.5522755967053448, + "flos": 18879999701760.0, + "grad_norm": 2.23111348106347, + "language_loss": 0.75911099, + "learning_rate": 1.7591050924783388e-06, + "loss": 0.78092873, + "num_input_tokens_seen": 99100835, + "step": 4593, + "time_per_iteration": 2.435822010040283 + }, + { + "auxiliary_loss_clip": 0.01035245, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 1.01448512, + "balance_loss_mlp": 1.0012188, + "epoch": 0.5523958395959839, + "flos": 64675622494080.0, + "grad_norm": 0.8409316010093588, + "language_loss": 0.57956284, + "learning_rate": 1.7583318102664661e-06, + "loss": 0.59993804, + "num_input_tokens_seen": 99168400, + "step": 4594, + "time_per_iteration": 3.162726879119873 + }, + { + "auxiliary_loss_clip": 0.01163231, + "auxiliary_loss_mlp": 0.01027378, + "balance_loss_clip": 1.04684472, + "balance_loss_mlp": 1.01982021, + "epoch": 0.552516082486623, + "flos": 10889732211840.0, + "grad_norm": 2.0834189121283617, + "language_loss": 0.7913065, + "learning_rate": 1.757558564715466e-06, + "loss": 0.81321263, + "num_input_tokens_seen": 99186475, + "step": 4595, + "time_per_iteration": 2.448741912841797 + }, + { + "auxiliary_loss_clip": 0.01161826, + "auxiliary_loss_mlp": 0.01026472, + "balance_loss_clip": 1.04710138, + "balance_loss_mlp": 1.01874757, + "epoch": 0.552636325377262, + "flos": 22199797376640.0, + "grad_norm": 2.5948709360976223, + "language_loss": 0.74100387, + "learning_rate": 1.7567853559426386e-06, + "loss": 0.76288682, + "num_input_tokens_seen": 99203525, + "step": 4596, + "time_per_iteration": 2.4705755710601807 + }, + { + "auxiliary_loss_clip": 0.01162587, + "auxiliary_loss_mlp": 0.0102786, + "balance_loss_clip": 1.04906225, + "balance_loss_mlp": 1.02059984, + "epoch": 0.5527565682679012, + "flos": 23988184652160.0, + "grad_norm": 1.954694638151164, + "language_loss": 0.75518489, + "learning_rate": 1.7560121840652797e-06, + "loss": 0.77708936, + "num_input_tokens_seen": 99222910, + "step": 4597, + "time_per_iteration": 2.4726083278656006 + }, + { + "auxiliary_loss_clip": 0.01123301, + "auxiliary_loss_mlp": 0.01021177, + "balance_loss_clip": 1.04606032, + "balance_loss_mlp": 1.01351833, + "epoch": 0.5528768111585403, + "flos": 19719267955200.0, + "grad_norm": 1.7970635243716697, + "language_loss": 0.69439077, + "learning_rate": 1.7552390492006782e-06, + "loss": 0.71583557, + "num_input_tokens_seen": 99241230, + "step": 4598, + "time_per_iteration": 2.5285274982452393 + }, + { + "auxiliary_loss_clip": 0.0112505, + "auxiliary_loss_mlp": 0.00761991, + "balance_loss_clip": 1.04282832, + "balance_loss_mlp": 1.00033128, + "epoch": 0.5529970540491793, + "flos": 26215975002240.0, + "grad_norm": 2.841268767851113, + "language_loss": 0.65259719, + "learning_rate": 1.7544659514661184e-06, + "loss": 0.67146754, + "num_input_tokens_seen": 99264320, + "step": 4599, + "time_per_iteration": 2.6457877159118652 + }, + { + "auxiliary_loss_clip": 0.01142168, + "auxiliary_loss_mlp": 0.01022629, + "balance_loss_clip": 1.04485798, + "balance_loss_mlp": 1.01534522, + "epoch": 0.5531172969398185, + "flos": 24425971614720.0, + "grad_norm": 2.0676723586161323, + "language_loss": 0.79505628, + "learning_rate": 1.7536928909788786e-06, + "loss": 0.81670427, + "num_input_tokens_seen": 99283625, + "step": 4600, + "time_per_iteration": 2.519928455352783 + }, + { + "auxiliary_loss_clip": 0.01038012, + "auxiliary_loss_mlp": 0.01002878, + "balance_loss_clip": 1.01439595, + "balance_loss_mlp": 1.00188303, + "epoch": 0.5532375398304575, + "flos": 64907316195840.0, + "grad_norm": 0.8867001722156724, + "language_loss": 0.61995786, + "learning_rate": 1.752919867856231e-06, + "loss": 0.64036679, + "num_input_tokens_seen": 99335270, + "step": 4601, + "time_per_iteration": 2.9552173614501953 + }, + { + "auxiliary_loss_clip": 0.01137935, + "auxiliary_loss_mlp": 0.01024868, + "balance_loss_clip": 1.04410577, + "balance_loss_mlp": 1.0175426, + "epoch": 0.5533577827210966, + "flos": 19683105937920.0, + "grad_norm": 1.7535870211720843, + "language_loss": 0.78933799, + "learning_rate": 1.7521468822154436e-06, + "loss": 0.81096601, + "num_input_tokens_seen": 99354185, + "step": 4602, + "time_per_iteration": 2.4831244945526123 + }, + { + "auxiliary_loss_clip": 0.01141231, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.04819393, + "balance_loss_mlp": 1.01973748, + "epoch": 0.5534780256117358, + "flos": 32306496076800.0, + "grad_norm": 1.82088858066226, + "language_loss": 0.74731058, + "learning_rate": 1.751373934173777e-06, + "loss": 0.76898801, + "num_input_tokens_seen": 99376930, + "step": 4603, + "time_per_iteration": 3.4246315956115723 + }, + { + "auxiliary_loss_clip": 0.01175575, + "auxiliary_loss_mlp": 0.01025871, + "balance_loss_clip": 1.04968834, + "balance_loss_mlp": 1.01816392, + "epoch": 0.5535982685023748, + "flos": 23222425582080.0, + "grad_norm": 1.5675869527216764, + "language_loss": 0.73273396, + "learning_rate": 1.750601023848487e-06, + "loss": 0.75474846, + "num_input_tokens_seen": 99397655, + "step": 4604, + "time_per_iteration": 2.4424405097961426 + }, + { + "auxiliary_loss_clip": 0.01174596, + "auxiliary_loss_mlp": 0.00761752, + "balance_loss_clip": 1.05228043, + "balance_loss_mlp": 1.00026774, + "epoch": 0.5537185113930139, + "flos": 24352534258560.0, + "grad_norm": 2.1650850015714878, + "language_loss": 0.73828447, + "learning_rate": 1.749828151356823e-06, + "loss": 0.75764793, + "num_input_tokens_seen": 99417850, + "step": 4605, + "time_per_iteration": 2.438225269317627 + }, + { + "auxiliary_loss_clip": 0.0114486, + "auxiliary_loss_mlp": 0.01029615, + "balance_loss_clip": 1.04667068, + "balance_loss_mlp": 1.02261186, + "epoch": 0.553838754283653, + "flos": 23549068886400.0, + "grad_norm": 1.6494187455371543, + "language_loss": 0.75877815, + "learning_rate": 1.7490553168160297e-06, + "loss": 0.78052294, + "num_input_tokens_seen": 99438920, + "step": 4606, + "time_per_iteration": 3.3813464641571045 + }, + { + "auxiliary_loss_clip": 0.01144062, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.0461936, + "balance_loss_mlp": 1.02014923, + "epoch": 0.5539589971742921, + "flos": 17275044205440.0, + "grad_norm": 1.8498956039126726, + "language_loss": 0.76338255, + "learning_rate": 1.748282520343345e-06, + "loss": 0.78510791, + "num_input_tokens_seen": 99457950, + "step": 4607, + "time_per_iteration": 2.4512076377868652 + }, + { + "auxiliary_loss_clip": 0.01169421, + "auxiliary_loss_mlp": 0.01022544, + "balance_loss_clip": 1.05077076, + "balance_loss_mlp": 1.01465845, + "epoch": 0.5540792400649311, + "flos": 27564169104000.0, + "grad_norm": 1.968105000801093, + "language_loss": 0.78958642, + "learning_rate": 1.7475097620560023e-06, + "loss": 0.81150603, + "num_input_tokens_seen": 99478015, + "step": 4608, + "time_per_iteration": 3.328918933868408 + }, + { + "auxiliary_loss_clip": 0.01174592, + "auxiliary_loss_mlp": 0.01022635, + "balance_loss_clip": 1.05170345, + "balance_loss_mlp": 1.01548803, + "epoch": 0.5541994829555702, + "flos": 23878657105920.0, + "grad_norm": 1.8116425445954205, + "language_loss": 0.71257412, + "learning_rate": 1.746737042071228e-06, + "loss": 0.73454636, + "num_input_tokens_seen": 99496520, + "step": 4609, + "time_per_iteration": 2.435483694076538 + }, + { + "auxiliary_loss_clip": 0.01142669, + "auxiliary_loss_mlp": 0.01022868, + "balance_loss_clip": 1.0479964, + "balance_loss_mlp": 1.01517344, + "epoch": 0.5543197258462094, + "flos": 20115721342080.0, + "grad_norm": 5.749539878932833, + "language_loss": 0.79233378, + "learning_rate": 1.7459643605062424e-06, + "loss": 0.81398916, + "num_input_tokens_seen": 99513780, + "step": 4610, + "time_per_iteration": 2.4782073497772217 + }, + { + "auxiliary_loss_clip": 0.01115797, + "auxiliary_loss_mlp": 0.01022765, + "balance_loss_clip": 1.0455904, + "balance_loss_mlp": 1.01502824, + "epoch": 0.5544399687368484, + "flos": 20916565021440.0, + "grad_norm": 1.6165793758894886, + "language_loss": 0.80664867, + "learning_rate": 1.745191717478262e-06, + "loss": 0.82803428, + "num_input_tokens_seen": 99532360, + "step": 4611, + "time_per_iteration": 2.5607051849365234 + }, + { + "auxiliary_loss_clip": 0.01143397, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.0495584, + "balance_loss_mlp": 1.02327096, + "epoch": 0.5545602116274875, + "flos": 25518661297920.0, + "grad_norm": 1.9027095472513167, + "language_loss": 0.79611576, + "learning_rate": 1.7444191131044948e-06, + "loss": 0.81785971, + "num_input_tokens_seen": 99552635, + "step": 4612, + "time_per_iteration": 3.260999917984009 + }, + { + "auxiliary_loss_clip": 0.01146826, + "auxiliary_loss_mlp": 0.01028122, + "balance_loss_clip": 1.04976153, + "balance_loss_mlp": 1.01980758, + "epoch": 0.5546804545181266, + "flos": 20995568985600.0, + "grad_norm": 1.7553601877609761, + "language_loss": 0.73065877, + "learning_rate": 1.7436465475021456e-06, + "loss": 0.75240827, + "num_input_tokens_seen": 99572685, + "step": 4613, + "time_per_iteration": 2.50041127204895 + }, + { + "auxiliary_loss_clip": 0.01123572, + "auxiliary_loss_mlp": 0.01023986, + "balance_loss_clip": 1.04623449, + "balance_loss_mlp": 1.01608241, + "epoch": 0.5548006974087657, + "flos": 26833638297600.0, + "grad_norm": 1.8490591092707873, + "language_loss": 0.71849304, + "learning_rate": 1.7428740207884111e-06, + "loss": 0.73996866, + "num_input_tokens_seen": 99593565, + "step": 4614, + "time_per_iteration": 2.595573663711548 + }, + { + "auxiliary_loss_clip": 0.01119956, + "auxiliary_loss_mlp": 0.01026869, + "balance_loss_clip": 1.04569817, + "balance_loss_mlp": 1.01921582, + "epoch": 0.5549209402994048, + "flos": 33656414031360.0, + "grad_norm": 1.7744247723909636, + "language_loss": 0.61129081, + "learning_rate": 1.7421015330804833e-06, + "loss": 0.63275909, + "num_input_tokens_seen": 99613485, + "step": 4615, + "time_per_iteration": 2.650078296661377 + }, + { + "auxiliary_loss_clip": 0.01174529, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.0511719, + "balance_loss_mlp": 1.01981258, + "epoch": 0.5550411831900439, + "flos": 23769524609280.0, + "grad_norm": 1.7454185104715776, + "language_loss": 0.72400331, + "learning_rate": 1.7413290844955475e-06, + "loss": 0.74602151, + "num_input_tokens_seen": 99633515, + "step": 4616, + "time_per_iteration": 2.481126070022583 + }, + { + "auxiliary_loss_clip": 0.01155513, + "auxiliary_loss_mlp": 0.01027665, + "balance_loss_clip": 1.05171251, + "balance_loss_mlp": 1.02044129, + "epoch": 0.555161426080683, + "flos": 21651189978240.0, + "grad_norm": 1.8683043543788287, + "language_loss": 0.78068757, + "learning_rate": 1.7405566751507843e-06, + "loss": 0.80251938, + "num_input_tokens_seen": 99651560, + "step": 4617, + "time_per_iteration": 2.460554838180542 + }, + { + "auxiliary_loss_clip": 0.01131125, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.04484153, + "balance_loss_mlp": 1.02599287, + "epoch": 0.555281668971322, + "flos": 49563116605440.0, + "grad_norm": 1.451190297533999, + "language_loss": 0.67608857, + "learning_rate": 1.7397843051633668e-06, + "loss": 0.69772977, + "num_input_tokens_seen": 99674255, + "step": 4618, + "time_per_iteration": 2.8118202686309814 + }, + { + "auxiliary_loss_clip": 0.01155112, + "auxiliary_loss_mlp": 0.01027301, + "balance_loss_clip": 1.04858947, + "balance_loss_mlp": 1.01961231, + "epoch": 0.5554019118619612, + "flos": 20741608851840.0, + "grad_norm": 1.6997013659868998, + "language_loss": 0.71361554, + "learning_rate": 1.739011974650464e-06, + "loss": 0.73543972, + "num_input_tokens_seen": 99693585, + "step": 4619, + "time_per_iteration": 2.458878517150879 + }, + { + "auxiliary_loss_clip": 0.01122592, + "auxiliary_loss_mlp": 0.01027172, + "balance_loss_clip": 1.04540467, + "balance_loss_mlp": 1.01944089, + "epoch": 0.5555221547526003, + "flos": 25483217552640.0, + "grad_norm": 3.1959499183109434, + "language_loss": 0.76647395, + "learning_rate": 1.7382396837292365e-06, + "loss": 0.78797162, + "num_input_tokens_seen": 99714045, + "step": 4620, + "time_per_iteration": 2.6123523712158203 + }, + { + "auxiliary_loss_clip": 0.01175055, + "auxiliary_loss_mlp": 0.01022097, + "balance_loss_clip": 1.05240893, + "balance_loss_mlp": 1.01437187, + "epoch": 0.5556423976432393, + "flos": 21762513204480.0, + "grad_norm": 1.8926166248520258, + "language_loss": 0.73393691, + "learning_rate": 1.737467432516841e-06, + "loss": 0.75590849, + "num_input_tokens_seen": 99734145, + "step": 4621, + "time_per_iteration": 2.434607982635498 + }, + { + "auxiliary_loss_clip": 0.01144136, + "auxiliary_loss_mlp": 0.01024403, + "balance_loss_clip": 1.04441214, + "balance_loss_mlp": 1.0169735, + "epoch": 0.5557626405338785, + "flos": 24900171989760.0, + "grad_norm": 2.30868781223006, + "language_loss": 0.74419034, + "learning_rate": 1.7366952211304274e-06, + "loss": 0.7658757, + "num_input_tokens_seen": 99751990, + "step": 4622, + "time_per_iteration": 2.510425329208374 + }, + { + "auxiliary_loss_clip": 0.01139126, + "auxiliary_loss_mlp": 0.01025484, + "balance_loss_clip": 1.04648924, + "balance_loss_mlp": 1.01780653, + "epoch": 0.5558828834245175, + "flos": 18697501676160.0, + "grad_norm": 2.0231673981919944, + "language_loss": 0.83958733, + "learning_rate": 1.735923049687139e-06, + "loss": 0.86123341, + "num_input_tokens_seen": 99768565, + "step": 4623, + "time_per_iteration": 2.466840982437134 + }, + { + "auxiliary_loss_clip": 0.01141118, + "auxiliary_loss_mlp": 0.01025308, + "balance_loss_clip": 1.04664481, + "balance_loss_mlp": 1.01798511, + "epoch": 0.5560031263151566, + "flos": 27272179445760.0, + "grad_norm": 1.4503057712483887, + "language_loss": 0.73727715, + "learning_rate": 1.7351509183041144e-06, + "loss": 0.75894141, + "num_input_tokens_seen": 99788895, + "step": 4624, + "time_per_iteration": 2.5360922813415527 + }, + { + "auxiliary_loss_clip": 0.01176778, + "auxiliary_loss_mlp": 0.01023867, + "balance_loss_clip": 1.05155659, + "balance_loss_mlp": 1.01664829, + "epoch": 0.5561233692057957, + "flos": 23403738458880.0, + "grad_norm": 1.6291736817877638, + "language_loss": 0.71514523, + "learning_rate": 1.7343788270984852e-06, + "loss": 0.73715168, + "num_input_tokens_seen": 99808035, + "step": 4625, + "time_per_iteration": 2.441911458969116 + }, + { + "auxiliary_loss_clip": 0.01143773, + "auxiliary_loss_mlp": 0.01023015, + "balance_loss_clip": 1.0490911, + "balance_loss_mlp": 1.01510882, + "epoch": 0.5562436120964348, + "flos": 37670867804160.0, + "grad_norm": 1.8309422499479109, + "language_loss": 0.74689662, + "learning_rate": 1.7336067761873764e-06, + "loss": 0.76856446, + "num_input_tokens_seen": 99830460, + "step": 4626, + "time_per_iteration": 2.6328115463256836 + }, + { + "auxiliary_loss_clip": 0.01169926, + "auxiliary_loss_mlp": 0.01029226, + "balance_loss_clip": 1.05162418, + "balance_loss_mlp": 1.02109575, + "epoch": 0.5563638549870739, + "flos": 25155245445120.0, + "grad_norm": 1.9123073967380806, + "language_loss": 0.76350218, + "learning_rate": 1.7328347656879076e-06, + "loss": 0.78549373, + "num_input_tokens_seen": 99850320, + "step": 4627, + "time_per_iteration": 2.504725694656372 + }, + { + "auxiliary_loss_clip": 0.01132678, + "auxiliary_loss_mlp": 0.01021946, + "balance_loss_clip": 1.04657364, + "balance_loss_mlp": 1.01391721, + "epoch": 0.556484097877713, + "flos": 13581810783360.0, + "grad_norm": 2.270634774073174, + "language_loss": 0.67934072, + "learning_rate": 1.7320627957171927e-06, + "loss": 0.70088696, + "num_input_tokens_seen": 99864980, + "step": 4628, + "time_per_iteration": 2.4655189514160156 + }, + { + "auxiliary_loss_clip": 0.01176321, + "auxiliary_loss_mlp": 0.01024437, + "balance_loss_clip": 1.05267274, + "balance_loss_mlp": 1.01743913, + "epoch": 0.5566043407683521, + "flos": 24681368292480.0, + "grad_norm": 4.885454474018997, + "language_loss": 0.8158983, + "learning_rate": 1.7312908663923382e-06, + "loss": 0.83790582, + "num_input_tokens_seen": 99881155, + "step": 4629, + "time_per_iteration": 2.449221134185791 + }, + { + "auxiliary_loss_clip": 0.01155254, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.04774356, + "balance_loss_mlp": 1.01722527, + "epoch": 0.5567245836589911, + "flos": 20588161950720.0, + "grad_norm": 2.6351390872886213, + "language_loss": 0.67132974, + "learning_rate": 1.7305189778304463e-06, + "loss": 0.69313288, + "num_input_tokens_seen": 99899330, + "step": 4630, + "time_per_iteration": 3.199492931365967 + }, + { + "auxiliary_loss_clip": 0.01149082, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.05212951, + "balance_loss_mlp": 1.02062833, + "epoch": 0.5568448265496303, + "flos": 20704189858560.0, + "grad_norm": 1.7926391479239319, + "language_loss": 0.79410577, + "learning_rate": 1.729747130148611e-06, + "loss": 0.81587911, + "num_input_tokens_seen": 99918525, + "step": 4631, + "time_per_iteration": 2.484035015106201 + }, + { + "auxiliary_loss_clip": 0.01138729, + "auxiliary_loss_mlp": 0.01026581, + "balance_loss_clip": 1.0478282, + "balance_loss_mlp": 1.01826048, + "epoch": 0.5569650694402694, + "flos": 25302910256640.0, + "grad_norm": 1.8188745188738866, + "language_loss": 0.76582205, + "learning_rate": 1.7289753234639208e-06, + "loss": 0.78747523, + "num_input_tokens_seen": 99937500, + "step": 4632, + "time_per_iteration": 2.560176372528076 + }, + { + "auxiliary_loss_clip": 0.01167092, + "auxiliary_loss_mlp": 0.01026494, + "balance_loss_clip": 1.05225813, + "balance_loss_mlp": 1.01866245, + "epoch": 0.5570853123309084, + "flos": 19712623939200.0, + "grad_norm": 3.534945863279641, + "language_loss": 0.76477939, + "learning_rate": 1.7282035578934592e-06, + "loss": 0.78671527, + "num_input_tokens_seen": 99955665, + "step": 4633, + "time_per_iteration": 3.321467161178589 + }, + { + "auxiliary_loss_clip": 0.01140989, + "auxiliary_loss_mlp": 0.01036001, + "balance_loss_clip": 1.05089867, + "balance_loss_mlp": 1.02868712, + "epoch": 0.5572055552215476, + "flos": 16108091153280.0, + "grad_norm": 1.8059749404313463, + "language_loss": 0.78944975, + "learning_rate": 1.727431833554301e-06, + "loss": 0.81121969, + "num_input_tokens_seen": 99974140, + "step": 4634, + "time_per_iteration": 2.521886110305786 + }, + { + "auxiliary_loss_clip": 0.01114021, + "auxiliary_loss_mlp": 0.01024146, + "balance_loss_clip": 1.04610968, + "balance_loss_mlp": 1.01646328, + "epoch": 0.5573257981121866, + "flos": 17128815937920.0, + "grad_norm": 1.791886050918992, + "language_loss": 0.77460384, + "learning_rate": 1.7266601505635175e-06, + "loss": 0.79598558, + "num_input_tokens_seen": 99991480, + "step": 4635, + "time_per_iteration": 3.4280803203582764 + }, + { + "auxiliary_loss_clip": 0.01162694, + "auxiliary_loss_mlp": 0.01026877, + "balance_loss_clip": 1.05179167, + "balance_loss_mlp": 1.01921773, + "epoch": 0.5574460410028257, + "flos": 18807029222400.0, + "grad_norm": 2.4125204917444436, + "language_loss": 0.75997543, + "learning_rate": 1.7258885090381717e-06, + "loss": 0.7818712, + "num_input_tokens_seen": 100009520, + "step": 4636, + "time_per_iteration": 2.451374053955078 + }, + { + "auxiliary_loss_clip": 0.01150358, + "auxiliary_loss_mlp": 0.01028099, + "balance_loss_clip": 1.04871297, + "balance_loss_mlp": 1.02099407, + "epoch": 0.5575662838934649, + "flos": 29642678530560.0, + "grad_norm": 1.8321429816259176, + "language_loss": 0.7845186, + "learning_rate": 1.7251169090953213e-06, + "loss": 0.80630314, + "num_input_tokens_seen": 100029995, + "step": 4637, + "time_per_iteration": 2.5829689502716064 + }, + { + "auxiliary_loss_clip": 0.01161407, + "auxiliary_loss_mlp": 0.01024314, + "balance_loss_clip": 1.04985428, + "balance_loss_mlp": 1.01630306, + "epoch": 0.5576865267841039, + "flos": 22054466949120.0, + "grad_norm": 2.5592453917876905, + "language_loss": 0.75951588, + "learning_rate": 1.7243453508520168e-06, + "loss": 0.78137308, + "num_input_tokens_seen": 100046980, + "step": 4638, + "time_per_iteration": 3.157209634780884 + }, + { + "auxiliary_loss_clip": 0.01147754, + "auxiliary_loss_mlp": 0.01027883, + "balance_loss_clip": 1.04773486, + "balance_loss_mlp": 1.02013993, + "epoch": 0.557806769674743, + "flos": 17196040241280.0, + "grad_norm": 1.8866592588179323, + "language_loss": 0.8457588, + "learning_rate": 1.7235738344253038e-06, + "loss": 0.86751521, + "num_input_tokens_seen": 100060610, + "step": 4639, + "time_per_iteration": 2.4657938480377197 + }, + { + "auxiliary_loss_clip": 0.01164245, + "auxiliary_loss_mlp": 0.01028033, + "balance_loss_clip": 1.055089, + "balance_loss_mlp": 1.01990926, + "epoch": 0.557927012565382, + "flos": 24712717887360.0, + "grad_norm": 3.1801102619615538, + "language_loss": 0.82909626, + "learning_rate": 1.72280235993222e-06, + "loss": 0.85101902, + "num_input_tokens_seen": 100078915, + "step": 4640, + "time_per_iteration": 2.508254051208496 + }, + { + "auxiliary_loss_clip": 0.01157918, + "auxiliary_loss_mlp": 0.00762758, + "balance_loss_clip": 1.04921222, + "balance_loss_mlp": 1.00029778, + "epoch": 0.5580472554560212, + "flos": 16983090460800.0, + "grad_norm": 2.2492420488409355, + "language_loss": 0.69526637, + "learning_rate": 1.722030927489798e-06, + "loss": 0.71447319, + "num_input_tokens_seen": 100096195, + "step": 4641, + "time_per_iteration": 2.440819501876831 + }, + { + "auxiliary_loss_clip": 0.011362, + "auxiliary_loss_mlp": 0.01022272, + "balance_loss_clip": 1.05016398, + "balance_loss_mlp": 1.01445508, + "epoch": 0.5581674983466602, + "flos": 23509100027520.0, + "grad_norm": 1.6734723263534574, + "language_loss": 0.74127352, + "learning_rate": 1.7212595372150634e-06, + "loss": 0.76285827, + "num_input_tokens_seen": 100116175, + "step": 4642, + "time_per_iteration": 2.5422399044036865 + }, + { + "auxiliary_loss_clip": 0.01178055, + "auxiliary_loss_mlp": 0.01025011, + "balance_loss_clip": 1.05357552, + "balance_loss_mlp": 1.01781082, + "epoch": 0.5582877412372993, + "flos": 13480291969920.0, + "grad_norm": 2.3068966582922714, + "language_loss": 0.72841978, + "learning_rate": 1.720488189225035e-06, + "loss": 0.75045049, + "num_input_tokens_seen": 100133875, + "step": 4643, + "time_per_iteration": 2.3839259147644043 + }, + { + "auxiliary_loss_clip": 0.01163627, + "auxiliary_loss_mlp": 0.010264, + "balance_loss_clip": 1.05009913, + "balance_loss_mlp": 1.01868665, + "epoch": 0.5584079841279385, + "flos": 21903605827200.0, + "grad_norm": 2.1320329244769236, + "language_loss": 0.79387081, + "learning_rate": 1.7197168836367265e-06, + "loss": 0.8157711, + "num_input_tokens_seen": 100150685, + "step": 4644, + "time_per_iteration": 2.4428772926330566 + }, + { + "auxiliary_loss_clip": 0.01157057, + "auxiliary_loss_mlp": 0.00762141, + "balance_loss_clip": 1.04720819, + "balance_loss_mlp": 1.00025892, + "epoch": 0.5585282270185775, + "flos": 18843550375680.0, + "grad_norm": 1.8273082612237137, + "language_loss": 0.81958282, + "learning_rate": 1.7189456205671433e-06, + "loss": 0.8387748, + "num_input_tokens_seen": 100169530, + "step": 4645, + "time_per_iteration": 2.4554295539855957 + }, + { + "auxiliary_loss_clip": 0.01170389, + "auxiliary_loss_mlp": 0.01026292, + "balance_loss_clip": 1.05173147, + "balance_loss_mlp": 1.01820397, + "epoch": 0.5586484699092166, + "flos": 21868449390720.0, + "grad_norm": 1.7662071525388225, + "language_loss": 0.82310963, + "learning_rate": 1.7181744001332866e-06, + "loss": 0.84507644, + "num_input_tokens_seen": 100188140, + "step": 4646, + "time_per_iteration": 2.4654436111450195 + }, + { + "auxiliary_loss_clip": 0.0117644, + "auxiliary_loss_mlp": 0.01025342, + "balance_loss_clip": 1.05408692, + "balance_loss_mlp": 1.01763546, + "epoch": 0.5587687127998557, + "flos": 22893232412160.0, + "grad_norm": 1.7128976657847303, + "language_loss": 0.63498676, + "learning_rate": 1.7174032224521493e-06, + "loss": 0.65700454, + "num_input_tokens_seen": 100206850, + "step": 4647, + "time_per_iteration": 2.4171664714813232 + }, + { + "auxiliary_loss_clip": 0.01161559, + "auxiliary_loss_mlp": 0.01026541, + "balance_loss_clip": 1.05078363, + "balance_loss_mlp": 1.01918876, + "epoch": 0.5588889556904948, + "flos": 20303067703680.0, + "grad_norm": 1.649069331894175, + "language_loss": 0.69753414, + "learning_rate": 1.7166320876407184e-06, + "loss": 0.71941519, + "num_input_tokens_seen": 100226270, + "step": 4648, + "time_per_iteration": 2.5370969772338867 + }, + { + "auxiliary_loss_clip": 0.01179676, + "auxiliary_loss_mlp": 0.00762517, + "balance_loss_clip": 1.0536679, + "balance_loss_mlp": 1.00035954, + "epoch": 0.5590091985811338, + "flos": 16472153450880.0, + "grad_norm": 2.00652214296276, + "language_loss": 0.67454857, + "learning_rate": 1.7158609958159742e-06, + "loss": 0.6939705, + "num_input_tokens_seen": 100243675, + "step": 4649, + "time_per_iteration": 2.410348653793335 + }, + { + "auxiliary_loss_clip": 0.01118583, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.04787612, + "balance_loss_mlp": 1.02455831, + "epoch": 0.559129441471773, + "flos": 14532186781440.0, + "grad_norm": 5.473230571034753, + "language_loss": 0.78052616, + "learning_rate": 1.7150899470948911e-06, + "loss": 0.80203521, + "num_input_tokens_seen": 100258940, + "step": 4650, + "time_per_iteration": 2.5439555644989014 + }, + { + "auxiliary_loss_clip": 0.01056703, + "auxiliary_loss_mlp": 0.01001233, + "balance_loss_clip": 1.0197506, + "balance_loss_mlp": 1.00019574, + "epoch": 0.5592496843624121, + "flos": 60521009852160.0, + "grad_norm": 0.7966790626454702, + "language_loss": 0.56700563, + "learning_rate": 1.7143189415944365e-06, + "loss": 0.58758503, + "num_input_tokens_seen": 100323400, + "step": 4651, + "time_per_iteration": 3.0973634719848633 + }, + { + "auxiliary_loss_clip": 0.01162009, + "auxiliary_loss_mlp": 0.01025623, + "balance_loss_clip": 1.05181646, + "balance_loss_mlp": 1.01738596, + "epoch": 0.5593699272530511, + "flos": 20886256920960.0, + "grad_norm": 1.632918942786101, + "language_loss": 0.7624889, + "learning_rate": 1.7135479794315714e-06, + "loss": 0.78436518, + "num_input_tokens_seen": 100340355, + "step": 4652, + "time_per_iteration": 2.449916362762451 + }, + { + "auxiliary_loss_clip": 0.01132379, + "auxiliary_loss_mlp": 0.01022603, + "balance_loss_clip": 1.04853582, + "balance_loss_mlp": 1.01524162, + "epoch": 0.5594901701436903, + "flos": 12896743616640.0, + "grad_norm": 1.918269431886607, + "language_loss": 0.79292631, + "learning_rate": 1.7127770607232502e-06, + "loss": 0.81447613, + "num_input_tokens_seen": 100358900, + "step": 4653, + "time_per_iteration": 2.5069708824157715 + }, + { + "auxiliary_loss_clip": 0.01141167, + "auxiliary_loss_mlp": 0.01024676, + "balance_loss_clip": 1.04892516, + "balance_loss_mlp": 1.01671898, + "epoch": 0.5596104130343293, + "flos": 23112107936640.0, + "grad_norm": 20.5625506750041, + "language_loss": 0.79580224, + "learning_rate": 1.7120061855864204e-06, + "loss": 0.81746072, + "num_input_tokens_seen": 100378910, + "step": 4654, + "time_per_iteration": 2.54665470123291 + }, + { + "auxiliary_loss_clip": 0.01164816, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.05463767, + "balance_loss_mlp": 1.02052474, + "epoch": 0.5597306559249684, + "flos": 25957812977280.0, + "grad_norm": 5.0736947359311095, + "language_loss": 0.70961249, + "learning_rate": 1.7112353541380233e-06, + "loss": 0.73154372, + "num_input_tokens_seen": 100398770, + "step": 4655, + "time_per_iteration": 2.4963390827178955 + }, + { + "auxiliary_loss_clip": 0.0115177, + "auxiliary_loss_mlp": 0.01030571, + "balance_loss_clip": 1.05194855, + "balance_loss_mlp": 1.02184474, + "epoch": 0.5598508988156076, + "flos": 22492289825280.0, + "grad_norm": 1.487328339271208, + "language_loss": 0.72375858, + "learning_rate": 1.7104645664949931e-06, + "loss": 0.74558198, + "num_input_tokens_seen": 100421240, + "step": 4656, + "time_per_iteration": 3.320931911468506 + }, + { + "auxiliary_loss_clip": 0.01151403, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.04784155, + "balance_loss_mlp": 1.01947641, + "epoch": 0.5599711417062466, + "flos": 23112538899840.0, + "grad_norm": 2.0556188410887994, + "language_loss": 0.71332437, + "learning_rate": 1.7096938227742584e-06, + "loss": 0.73511606, + "num_input_tokens_seen": 100442370, + "step": 4657, + "time_per_iteration": 2.521167039871216 + }, + { + "auxiliary_loss_clip": 0.01178348, + "auxiliary_loss_mlp": 0.01026276, + "balance_loss_clip": 1.05337214, + "balance_loss_mlp": 1.01842594, + "epoch": 0.5600913845968857, + "flos": 22339345714560.0, + "grad_norm": 1.8397435219302765, + "language_loss": 0.84366721, + "learning_rate": 1.70892312309274e-06, + "loss": 0.86571342, + "num_input_tokens_seen": 100460260, + "step": 4658, + "time_per_iteration": 2.433173894882202 + }, + { + "auxiliary_loss_clip": 0.01147959, + "auxiliary_loss_mlp": 0.0102594, + "balance_loss_clip": 1.04343128, + "balance_loss_mlp": 1.01782179, + "epoch": 0.5602116274875248, + "flos": 17633791290240.0, + "grad_norm": 2.975392959743797, + "language_loss": 0.68112212, + "learning_rate": 1.7081524675673523e-06, + "loss": 0.70286107, + "num_input_tokens_seen": 100475750, + "step": 4659, + "time_per_iteration": 2.4482812881469727 + }, + { + "auxiliary_loss_clip": 0.01056263, + "auxiliary_loss_mlp": 0.0100064, + "balance_loss_clip": 1.01519918, + "balance_loss_mlp": 0.99962026, + "epoch": 0.5603318703781639, + "flos": 70115945529600.0, + "grad_norm": 0.7713027304735945, + "language_loss": 0.5960052, + "learning_rate": 1.7073818563150026e-06, + "loss": 0.61657417, + "num_input_tokens_seen": 100537830, + "step": 4660, + "time_per_iteration": 3.9954657554626465 + }, + { + "auxiliary_loss_clip": 0.01159389, + "auxiliary_loss_mlp": 0.01025404, + "balance_loss_clip": 1.04952502, + "balance_loss_mlp": 1.01765525, + "epoch": 0.560452113268803, + "flos": 18545850455040.0, + "grad_norm": 2.1787942141850234, + "language_loss": 0.86644322, + "learning_rate": 1.7066112894525935e-06, + "loss": 0.88829112, + "num_input_tokens_seen": 100555910, + "step": 4661, + "time_per_iteration": 2.4842474460601807 + }, + { + "auxiliary_loss_clip": 0.01141045, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.04810047, + "balance_loss_mlp": 1.02371418, + "epoch": 0.5605723561594421, + "flos": 25264665250560.0, + "grad_norm": 1.6296382630935142, + "language_loss": 0.72843611, + "learning_rate": 1.7058407670970177e-06, + "loss": 0.75016201, + "num_input_tokens_seen": 100577385, + "step": 4662, + "time_per_iteration": 3.357377529144287 + }, + { + "auxiliary_loss_clip": 0.01167646, + "auxiliary_loss_mlp": 0.01029488, + "balance_loss_clip": 1.05021274, + "balance_loss_mlp": 1.02133441, + "epoch": 0.5606925990500812, + "flos": 20594949621120.0, + "grad_norm": 1.6277637897738084, + "language_loss": 0.60955441, + "learning_rate": 1.7050702893651643e-06, + "loss": 0.63152575, + "num_input_tokens_seen": 100596965, + "step": 4663, + "time_per_iteration": 2.4452908039093018 + }, + { + "auxiliary_loss_clip": 0.01163357, + "auxiliary_loss_mlp": 0.01026493, + "balance_loss_clip": 1.05200791, + "balance_loss_mlp": 1.01842904, + "epoch": 0.5608128419407202, + "flos": 35006044677120.0, + "grad_norm": 2.5419460593517735, + "language_loss": 0.76049721, + "learning_rate": 1.7042998563739134e-06, + "loss": 0.78239572, + "num_input_tokens_seen": 100615315, + "step": 4664, + "time_per_iteration": 2.560274362564087 + }, + { + "auxiliary_loss_clip": 0.01155389, + "auxiliary_loss_mlp": 0.0103225, + "balance_loss_clip": 1.04751158, + "balance_loss_mlp": 1.0241617, + "epoch": 0.5609330848313594, + "flos": 24639819235200.0, + "grad_norm": 2.256913655105567, + "language_loss": 0.71309018, + "learning_rate": 1.703529468240139e-06, + "loss": 0.73496658, + "num_input_tokens_seen": 100634185, + "step": 4665, + "time_per_iteration": 3.2564074993133545 + }, + { + "auxiliary_loss_clip": 0.01142823, + "auxiliary_loss_mlp": 0.01027203, + "balance_loss_clip": 1.05012894, + "balance_loss_mlp": 1.01922441, + "epoch": 0.5610533277219985, + "flos": 18762894385920.0, + "grad_norm": 2.372843300588213, + "language_loss": 0.74257457, + "learning_rate": 1.7027591250807088e-06, + "loss": 0.76427484, + "num_input_tokens_seen": 100651360, + "step": 4666, + "time_per_iteration": 2.4598982334136963 + }, + { + "auxiliary_loss_clip": 0.01181383, + "auxiliary_loss_mlp": 0.01026381, + "balance_loss_clip": 1.05544686, + "balance_loss_mlp": 1.01853704, + "epoch": 0.5611735706126375, + "flos": 15012384727680.0, + "grad_norm": 2.4330320735330004, + "language_loss": 0.84826428, + "learning_rate": 1.7019888270124825e-06, + "loss": 0.8703419, + "num_input_tokens_seen": 100668525, + "step": 4667, + "time_per_iteration": 2.3941941261291504 + }, + { + "auxiliary_loss_clip": 0.01169649, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.05386961, + "balance_loss_mlp": 1.02099109, + "epoch": 0.5612938135032767, + "flos": 16468167041280.0, + "grad_norm": 1.7602220885835493, + "language_loss": 0.82137924, + "learning_rate": 1.7012185741523147e-06, + "loss": 0.84337234, + "num_input_tokens_seen": 100684850, + "step": 4668, + "time_per_iteration": 2.4194586277008057 + }, + { + "auxiliary_loss_clip": 0.01180237, + "auxiliary_loss_mlp": 0.0102903, + "balance_loss_clip": 1.05491328, + "balance_loss_mlp": 1.02130532, + "epoch": 0.5614140563939157, + "flos": 25666433850240.0, + "grad_norm": 2.8924875264821504, + "language_loss": 0.62510872, + "learning_rate": 1.7004483666170514e-06, + "loss": 0.64720142, + "num_input_tokens_seen": 100705345, + "step": 4669, + "time_per_iteration": 2.4603211879730225 + }, + { + "auxiliary_loss_clip": 0.01163957, + "auxiliary_loss_mlp": 0.01026964, + "balance_loss_clip": 1.05115509, + "balance_loss_mlp": 1.01964474, + "epoch": 0.5615342992845548, + "flos": 24717566223360.0, + "grad_norm": 2.130757627893791, + "language_loss": 0.80699134, + "learning_rate": 1.699678204523533e-06, + "loss": 0.82890058, + "num_input_tokens_seen": 100725210, + "step": 4670, + "time_per_iteration": 2.473994731903076 + }, + { + "auxiliary_loss_clip": 0.01154442, + "auxiliary_loss_mlp": 0.01029491, + "balance_loss_clip": 1.05346835, + "balance_loss_mlp": 1.02069354, + "epoch": 0.5616545421751938, + "flos": 22015934634240.0, + "grad_norm": 3.787960458819434, + "language_loss": 0.69138598, + "learning_rate": 1.6989080879885918e-06, + "loss": 0.71322536, + "num_input_tokens_seen": 100743070, + "step": 4671, + "time_per_iteration": 2.504103660583496 + }, + { + "auxiliary_loss_clip": 0.01042694, + "auxiliary_loss_mlp": 0.0100182, + "balance_loss_clip": 1.01499963, + "balance_loss_mlp": 1.00078893, + "epoch": 0.561774785065833, + "flos": 53760358690560.0, + "grad_norm": 0.9010901182194945, + "language_loss": 0.61043864, + "learning_rate": 1.6981380171290544e-06, + "loss": 0.63088381, + "num_input_tokens_seen": 100804095, + "step": 4672, + "time_per_iteration": 3.06972074508667 + }, + { + "auxiliary_loss_clip": 0.01145017, + "auxiliary_loss_mlp": 0.01028078, + "balance_loss_clip": 1.04662192, + "balance_loss_mlp": 1.02015662, + "epoch": 0.5618950279564721, + "flos": 19750007018880.0, + "grad_norm": 1.9095044973685826, + "language_loss": 0.74349374, + "learning_rate": 1.6973679920617396e-06, + "loss": 0.76522464, + "num_input_tokens_seen": 100821630, + "step": 4673, + "time_per_iteration": 2.4905998706817627 + }, + { + "auxiliary_loss_clip": 0.01147745, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.05116308, + "balance_loss_mlp": 1.01955128, + "epoch": 0.5620152708471111, + "flos": 16800592435200.0, + "grad_norm": 2.422210034430206, + "language_loss": 0.85206586, + "learning_rate": 1.6965980129034603e-06, + "loss": 0.87381947, + "num_input_tokens_seen": 100839015, + "step": 4674, + "time_per_iteration": 2.469625473022461 + }, + { + "auxiliary_loss_clip": 0.01152973, + "auxiliary_loss_mlp": 0.01025876, + "balance_loss_clip": 1.0539403, + "balance_loss_mlp": 1.01809764, + "epoch": 0.5621355137377503, + "flos": 26797799502720.0, + "grad_norm": 1.5154359158780253, + "language_loss": 0.76440513, + "learning_rate": 1.6958280797710209e-06, + "loss": 0.78619361, + "num_input_tokens_seen": 100860940, + "step": 4675, + "time_per_iteration": 2.5471930503845215 + }, + { + "auxiliary_loss_clip": 0.01053591, + "auxiliary_loss_mlp": 0.01002585, + "balance_loss_clip": 1.01535988, + "balance_loss_mlp": 1.00156581, + "epoch": 0.5622557566283893, + "flos": 61207046686080.0, + "grad_norm": 0.7154377444825417, + "language_loss": 0.54814363, + "learning_rate": 1.6950581927812198e-06, + "loss": 0.56870538, + "num_input_tokens_seen": 100920510, + "step": 4676, + "time_per_iteration": 2.943524122238159 + }, + { + "auxiliary_loss_clip": 0.01164336, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.05168009, + "balance_loss_mlp": 1.01879597, + "epoch": 0.5623759995190284, + "flos": 26468534505600.0, + "grad_norm": 2.952573418583003, + "language_loss": 0.79496479, + "learning_rate": 1.6942883520508486e-06, + "loss": 0.81687587, + "num_input_tokens_seen": 100939245, + "step": 4677, + "time_per_iteration": 2.500335931777954 + }, + { + "auxiliary_loss_clip": 0.01164734, + "auxiliary_loss_mlp": 0.01025996, + "balance_loss_clip": 1.05082297, + "balance_loss_mlp": 1.01802731, + "epoch": 0.5624962424096676, + "flos": 19390900798080.0, + "grad_norm": 2.0054358670043393, + "language_loss": 0.77308154, + "learning_rate": 1.693518557696691e-06, + "loss": 0.79498887, + "num_input_tokens_seen": 100958385, + "step": 4678, + "time_per_iteration": 2.434480667114258 + }, + { + "auxiliary_loss_clip": 0.01159946, + "auxiliary_loss_mlp": 0.01024886, + "balance_loss_clip": 1.04851234, + "balance_loss_mlp": 1.01710153, + "epoch": 0.5626164853003066, + "flos": 20667345482880.0, + "grad_norm": 2.010738097474918, + "language_loss": 0.88986361, + "learning_rate": 1.6927488098355252e-06, + "loss": 0.91171193, + "num_input_tokens_seen": 100976015, + "step": 4679, + "time_per_iteration": 2.4582018852233887 + }, + { + "auxiliary_loss_clip": 0.01038544, + "auxiliary_loss_mlp": 0.01003137, + "balance_loss_clip": 1.01413584, + "balance_loss_mlp": 1.00193274, + "epoch": 0.5627367281909457, + "flos": 62766071665920.0, + "grad_norm": 0.9094488422671037, + "language_loss": 0.63175243, + "learning_rate": 1.6919791085841201e-06, + "loss": 0.65216923, + "num_input_tokens_seen": 101033425, + "step": 4680, + "time_per_iteration": 3.0931854248046875 + }, + { + "auxiliary_loss_clip": 0.0115805, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.04772329, + "balance_loss_mlp": 1.0214572, + "epoch": 0.5628569710815848, + "flos": 12787144243200.0, + "grad_norm": 2.2988181829739522, + "language_loss": 0.78827047, + "learning_rate": 1.6912094540592396e-06, + "loss": 0.81015313, + "num_input_tokens_seen": 101048945, + "step": 4681, + "time_per_iteration": 2.4543240070343018 + }, + { + "auxiliary_loss_clip": 0.01162639, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.05067933, + "balance_loss_mlp": 1.02122474, + "epoch": 0.5629772139722239, + "flos": 13762082165760.0, + "grad_norm": 5.605968390907182, + "language_loss": 0.81280792, + "learning_rate": 1.6904398463776393e-06, + "loss": 0.83472401, + "num_input_tokens_seen": 101062745, + "step": 4682, + "time_per_iteration": 2.3975889682769775 + }, + { + "auxiliary_loss_clip": 0.0116499, + "auxiliary_loss_mlp": 0.01025936, + "balance_loss_clip": 1.0495671, + "balance_loss_mlp": 1.01804996, + "epoch": 0.5630974568628629, + "flos": 21467830026240.0, + "grad_norm": 1.7068709996142475, + "language_loss": 0.72629416, + "learning_rate": 1.6896702856560683e-06, + "loss": 0.7482034, + "num_input_tokens_seen": 101081840, + "step": 4683, + "time_per_iteration": 3.284876585006714 + }, + { + "auxiliary_loss_clip": 0.01133642, + "auxiliary_loss_mlp": 0.01025367, + "balance_loss_clip": 1.04509115, + "balance_loss_mlp": 1.01748109, + "epoch": 0.5632176997535021, + "flos": 14245907385600.0, + "grad_norm": 2.825214411476421, + "language_loss": 0.69354212, + "learning_rate": 1.6889007720112677e-06, + "loss": 0.71513218, + "num_input_tokens_seen": 101099585, + "step": 4684, + "time_per_iteration": 2.4920742511749268 + }, + { + "auxiliary_loss_clip": 0.01167496, + "auxiliary_loss_mlp": 0.01026046, + "balance_loss_clip": 1.05389929, + "balance_loss_mlp": 1.01875997, + "epoch": 0.5633379426441412, + "flos": 20812244947200.0, + "grad_norm": 1.5469558476558942, + "language_loss": 0.77391911, + "learning_rate": 1.6881313055599734e-06, + "loss": 0.79585457, + "num_input_tokens_seen": 101119515, + "step": 4685, + "time_per_iteration": 2.456150531768799 + }, + { + "auxiliary_loss_clip": 0.01136625, + "auxiliary_loss_mlp": 0.01023181, + "balance_loss_clip": 1.04566884, + "balance_loss_mlp": 1.01486015, + "epoch": 0.5634581855347802, + "flos": 22600883617920.0, + "grad_norm": 2.4028321410814883, + "language_loss": 0.82260096, + "learning_rate": 1.6873618864189117e-06, + "loss": 0.84419894, + "num_input_tokens_seen": 101135285, + "step": 4686, + "time_per_iteration": 3.324130058288574 + }, + { + "auxiliary_loss_clip": 0.01163568, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.05002165, + "balance_loss_mlp": 1.02490616, + "epoch": 0.5635784284254194, + "flos": 21506972872320.0, + "grad_norm": 2.2035073149545683, + "language_loss": 0.7750845, + "learning_rate": 1.686592514704803e-06, + "loss": 0.79705203, + "num_input_tokens_seen": 101152680, + "step": 4687, + "time_per_iteration": 2.4573662281036377 + }, + { + "auxiliary_loss_clip": 0.01149164, + "auxiliary_loss_mlp": 0.01026419, + "balance_loss_clip": 1.05315936, + "balance_loss_mlp": 1.01918936, + "epoch": 0.5636986713160584, + "flos": 19827466698240.0, + "grad_norm": 2.2018676842780556, + "language_loss": 0.70977378, + "learning_rate": 1.685823190534361e-06, + "loss": 0.73152965, + "num_input_tokens_seen": 101170920, + "step": 4688, + "time_per_iteration": 2.476706027984619 + }, + { + "auxiliary_loss_clip": 0.01181615, + "auxiliary_loss_mlp": 0.01024859, + "balance_loss_clip": 1.05358446, + "balance_loss_mlp": 1.016204, + "epoch": 0.5638189142066975, + "flos": 19792453916160.0, + "grad_norm": 1.8715929235791309, + "language_loss": 0.83937359, + "learning_rate": 1.6850539140242907e-06, + "loss": 0.86143827, + "num_input_tokens_seen": 101190180, + "step": 4689, + "time_per_iteration": 3.2551026344299316 + }, + { + "auxiliary_loss_clip": 0.01167918, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.05150843, + "balance_loss_mlp": 1.0233779, + "epoch": 0.5639391570973367, + "flos": 22893771116160.0, + "grad_norm": 1.882163472555844, + "language_loss": 0.81961226, + "learning_rate": 1.684284685291292e-06, + "loss": 0.84160215, + "num_input_tokens_seen": 101211825, + "step": 4690, + "time_per_iteration": 2.471123695373535 + }, + { + "auxiliary_loss_clip": 0.01179296, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.05327809, + "balance_loss_mlp": 1.02306604, + "epoch": 0.5640593999879757, + "flos": 23727077712000.0, + "grad_norm": 1.8887818603786286, + "language_loss": 0.81538939, + "learning_rate": 1.683515504452055e-06, + "loss": 0.83749378, + "num_input_tokens_seen": 101229200, + "step": 4691, + "time_per_iteration": 3.263162612915039 + }, + { + "auxiliary_loss_clip": 0.0112631, + "auxiliary_loss_mlp": 0.01034121, + "balance_loss_clip": 1.04475379, + "balance_loss_mlp": 1.02512097, + "epoch": 0.5641796428786148, + "flos": 22710123855360.0, + "grad_norm": 2.11691359539167, + "language_loss": 0.66213882, + "learning_rate": 1.6827463716232648e-06, + "loss": 0.68374312, + "num_input_tokens_seen": 101249860, + "step": 4692, + "time_per_iteration": 2.557039260864258 + }, + { + "auxiliary_loss_clip": 0.01162309, + "auxiliary_loss_mlp": 0.00762565, + "balance_loss_clip": 1.05038953, + "balance_loss_mlp": 1.00035405, + "epoch": 0.5642998857692539, + "flos": 19791987039360.0, + "grad_norm": 2.8169886462108717, + "language_loss": 0.75555813, + "learning_rate": 1.6819772869215972e-06, + "loss": 0.77480686, + "num_input_tokens_seen": 101268940, + "step": 4693, + "time_per_iteration": 2.4532949924468994 + }, + { + "auxiliary_loss_clip": 0.01155663, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.05228376, + "balance_loss_mlp": 1.02092314, + "epoch": 0.564420128659893, + "flos": 23185904428800.0, + "grad_norm": 1.8697942062197417, + "language_loss": 0.82250357, + "learning_rate": 1.6812082504637228e-06, + "loss": 0.84434199, + "num_input_tokens_seen": 101290260, + "step": 4694, + "time_per_iteration": 2.5473644733428955 + }, + { + "auxiliary_loss_clip": 0.01161255, + "auxiliary_loss_mlp": 0.01022309, + "balance_loss_clip": 1.05277026, + "balance_loss_mlp": 1.0144887, + "epoch": 0.564540371550532, + "flos": 23258264376960.0, + "grad_norm": 1.6775129219736873, + "language_loss": 0.74346316, + "learning_rate": 1.6804392623663025e-06, + "loss": 0.76529878, + "num_input_tokens_seen": 101311465, + "step": 4695, + "time_per_iteration": 2.5016591548919678 + }, + { + "auxiliary_loss_clip": 0.01156502, + "auxiliary_loss_mlp": 0.01025042, + "balance_loss_clip": 1.04980719, + "balance_loss_mlp": 1.01694155, + "epoch": 0.5646606144411712, + "flos": 25010058672000.0, + "grad_norm": 1.940493550732572, + "language_loss": 0.78128088, + "learning_rate": 1.6796703227459935e-06, + "loss": 0.80309623, + "num_input_tokens_seen": 101329420, + "step": 4696, + "time_per_iteration": 2.481167793273926 + }, + { + "auxiliary_loss_clip": 0.01112191, + "auxiliary_loss_mlp": 0.01024885, + "balance_loss_clip": 1.04430425, + "balance_loss_mlp": 1.01689839, + "epoch": 0.5647808573318103, + "flos": 36539645806080.0, + "grad_norm": 1.7537009572840152, + "language_loss": 0.7605378, + "learning_rate": 1.6789014317194407e-06, + "loss": 0.78190857, + "num_input_tokens_seen": 101350900, + "step": 4697, + "time_per_iteration": 2.7260093688964844 + }, + { + "auxiliary_loss_clip": 0.01159502, + "auxiliary_loss_mlp": 0.01028812, + "balance_loss_clip": 1.05382454, + "balance_loss_mlp": 1.02039647, + "epoch": 0.5649011002224493, + "flos": 22528451842560.0, + "grad_norm": 2.605412069969427, + "language_loss": 0.72936368, + "learning_rate": 1.6781325894032853e-06, + "loss": 0.75124681, + "num_input_tokens_seen": 101369860, + "step": 4698, + "time_per_iteration": 2.546757221221924 + }, + { + "auxiliary_loss_clip": 0.01148021, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.05302024, + "balance_loss_mlp": 1.02506351, + "epoch": 0.5650213431130885, + "flos": 18515147304960.0, + "grad_norm": 1.9566633712524366, + "language_loss": 0.92016459, + "learning_rate": 1.6773637959141608e-06, + "loss": 0.94197464, + "num_input_tokens_seen": 101386835, + "step": 4699, + "time_per_iteration": 2.4633591175079346 + }, + { + "auxiliary_loss_clip": 0.01139715, + "auxiliary_loss_mlp": 0.01028008, + "balance_loss_clip": 1.04763365, + "balance_loss_mlp": 1.02000892, + "epoch": 0.5651415860037275, + "flos": 17526310819200.0, + "grad_norm": 2.056013661694584, + "language_loss": 0.66345894, + "learning_rate": 1.6765950513686915e-06, + "loss": 0.68513614, + "num_input_tokens_seen": 101404945, + "step": 4700, + "time_per_iteration": 2.4759459495544434 + }, + { + "auxiliary_loss_clip": 0.01121685, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.04298306, + "balance_loss_mlp": 1.0247252, + "epoch": 0.5652618288943666, + "flos": 25520026014720.0, + "grad_norm": 2.1644489644946385, + "language_loss": 0.76034021, + "learning_rate": 1.675826355883496e-06, + "loss": 0.78188813, + "num_input_tokens_seen": 101424160, + "step": 4701, + "time_per_iteration": 2.634275197982788 + }, + { + "auxiliary_loss_clip": 0.01144623, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.05011082, + "balance_loss_mlp": 1.02342439, + "epoch": 0.5653820717850057, + "flos": 19683105937920.0, + "grad_norm": 2.4807865185467373, + "language_loss": 0.78902924, + "learning_rate": 1.6750577095751848e-06, + "loss": 0.81079137, + "num_input_tokens_seen": 101443270, + "step": 4702, + "time_per_iteration": 2.4767277240753174 + }, + { + "auxiliary_loss_clip": 0.01175368, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.05147231, + "balance_loss_mlp": 1.02386153, + "epoch": 0.5655023146756448, + "flos": 26979722910720.0, + "grad_norm": 1.7288361627247184, + "language_loss": 0.7252205, + "learning_rate": 1.6742891125603605e-06, + "loss": 0.74729097, + "num_input_tokens_seen": 101464175, + "step": 4703, + "time_per_iteration": 2.464181661605835 + }, + { + "auxiliary_loss_clip": 0.01161838, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.05123281, + "balance_loss_mlp": 1.01860094, + "epoch": 0.5656225575662839, + "flos": 27669351104640.0, + "grad_norm": 1.8079627743755122, + "language_loss": 0.72152948, + "learning_rate": 1.6735205649556185e-06, + "loss": 0.74341804, + "num_input_tokens_seen": 101484045, + "step": 4704, + "time_per_iteration": 2.515594959259033 + }, + { + "auxiliary_loss_clip": 0.01138262, + "auxiliary_loss_mlp": 0.01029459, + "balance_loss_clip": 1.0479877, + "balance_loss_mlp": 1.02154326, + "epoch": 0.5657428004569229, + "flos": 24349732997760.0, + "grad_norm": 1.6270290300969787, + "language_loss": 0.84739369, + "learning_rate": 1.6727520668775476e-06, + "loss": 0.86907089, + "num_input_tokens_seen": 101504330, + "step": 4705, + "time_per_iteration": 2.556506633758545 + }, + { + "auxiliary_loss_clip": 0.01179483, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.05179179, + "balance_loss_mlp": 1.02099538, + "epoch": 0.5658630433475621, + "flos": 21944041562880.0, + "grad_norm": 1.5995932013267553, + "language_loss": 0.7536822, + "learning_rate": 1.6719836184427275e-06, + "loss": 0.77577138, + "num_input_tokens_seen": 101524635, + "step": 4706, + "time_per_iteration": 2.4360644817352295 + }, + { + "auxiliary_loss_clip": 0.0114712, + "auxiliary_loss_mlp": 0.01026595, + "balance_loss_clip": 1.04825616, + "balance_loss_mlp": 1.01923943, + "epoch": 0.5659832862382012, + "flos": 30409012218240.0, + "grad_norm": 2.0394749216234076, + "language_loss": 0.64126742, + "learning_rate": 1.671215219767733e-06, + "loss": 0.66300452, + "num_input_tokens_seen": 101544095, + "step": 4707, + "time_per_iteration": 2.5578556060791016 + }, + { + "auxiliary_loss_clip": 0.01123661, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.04695809, + "balance_loss_mlp": 1.02317405, + "epoch": 0.5661035291288402, + "flos": 13188194570880.0, + "grad_norm": 2.291189978795589, + "language_loss": 0.7618767, + "learning_rate": 1.670446870969127e-06, + "loss": 0.78342664, + "num_input_tokens_seen": 101561760, + "step": 4708, + "time_per_iteration": 2.5221760272979736 + }, + { + "auxiliary_loss_clip": 0.0115383, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.05117726, + "balance_loss_mlp": 1.02042902, + "epoch": 0.5662237720194794, + "flos": 16143032108160.0, + "grad_norm": 2.1385840302027335, + "language_loss": 0.8010447, + "learning_rate": 1.6696785721634685e-06, + "loss": 0.82286561, + "num_input_tokens_seen": 101576245, + "step": 4709, + "time_per_iteration": 3.2841742038726807 + }, + { + "auxiliary_loss_clip": 0.01164776, + "auxiliary_loss_mlp": 0.01033832, + "balance_loss_clip": 1.0502218, + "balance_loss_mlp": 1.02562487, + "epoch": 0.5663440149101184, + "flos": 17676848718720.0, + "grad_norm": 2.0248618267950516, + "language_loss": 0.73495239, + "learning_rate": 1.6689103234673086e-06, + "loss": 0.75693852, + "num_input_tokens_seen": 101594565, + "step": 4710, + "time_per_iteration": 2.4507322311401367 + }, + { + "auxiliary_loss_clip": 0.01148322, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.05052042, + "balance_loss_mlp": 1.0216713, + "epoch": 0.5664642578007575, + "flos": 23368330627200.0, + "grad_norm": 4.111201650649018, + "language_loss": 0.76954913, + "learning_rate": 1.668142124997189e-06, + "loss": 0.79132938, + "num_input_tokens_seen": 101614225, + "step": 4711, + "time_per_iteration": 2.5090441703796387 + }, + { + "auxiliary_loss_clip": 0.01048453, + "auxiliary_loss_mlp": 0.01004458, + "balance_loss_clip": 1.01747072, + "balance_loss_mlp": 1.0035342, + "epoch": 0.5665845006913967, + "flos": 65516470945920.0, + "grad_norm": 0.7247073504934648, + "language_loss": 0.59846222, + "learning_rate": 1.6673739768696453e-06, + "loss": 0.61899137, + "num_input_tokens_seen": 101680795, + "step": 4712, + "time_per_iteration": 3.0884652137756348 + }, + { + "auxiliary_loss_clip": 0.01156187, + "auxiliary_loss_mlp": 0.01028721, + "balance_loss_clip": 1.04947615, + "balance_loss_mlp": 1.020105, + "epoch": 0.5667047435820357, + "flos": 26140885620480.0, + "grad_norm": 2.56243291798559, + "language_loss": 0.77386022, + "learning_rate": 1.6666058792012052e-06, + "loss": 0.79570925, + "num_input_tokens_seen": 101701680, + "step": 4713, + "time_per_iteration": 3.3893685340881348 + }, + { + "auxiliary_loss_clip": 0.01065614, + "auxiliary_loss_mlp": 0.01001889, + "balance_loss_clip": 1.01392603, + "balance_loss_mlp": 1.00088787, + "epoch": 0.5668249864726748, + "flos": 71866949725440.0, + "grad_norm": 0.881443577343635, + "language_loss": 0.68800932, + "learning_rate": 1.6658378321083878e-06, + "loss": 0.70868433, + "num_input_tokens_seen": 101766010, + "step": 4714, + "time_per_iteration": 3.082977056503296 + }, + { + "auxiliary_loss_clip": 0.01111679, + "auxiliary_loss_mlp": 0.01025905, + "balance_loss_clip": 1.04485428, + "balance_loss_mlp": 1.01834989, + "epoch": 0.5669452293633139, + "flos": 22195667312640.0, + "grad_norm": 2.310161133539983, + "language_loss": 0.82514113, + "learning_rate": 1.6650698357077055e-06, + "loss": 0.84651697, + "num_input_tokens_seen": 101783055, + "step": 4715, + "time_per_iteration": 2.621511459350586 + }, + { + "auxiliary_loss_clip": 0.01154776, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.04928899, + "balance_loss_mlp": 1.02381158, + "epoch": 0.567065472253953, + "flos": 18223193560320.0, + "grad_norm": 2.923296883424349, + "language_loss": 0.80919373, + "learning_rate": 1.6643018901156632e-06, + "loss": 0.83106494, + "num_input_tokens_seen": 101802150, + "step": 4716, + "time_per_iteration": 3.3085148334503174 + }, + { + "auxiliary_loss_clip": 0.01155781, + "auxiliary_loss_mlp": 0.01026654, + "balance_loss_clip": 1.05090141, + "balance_loss_mlp": 1.0188309, + "epoch": 0.567185715144592, + "flos": 20371548983040.0, + "grad_norm": 3.0750420763824127, + "language_loss": 0.79597688, + "learning_rate": 1.6635339954487566e-06, + "loss": 0.81780124, + "num_input_tokens_seen": 101818025, + "step": 4717, + "time_per_iteration": 2.4891278743743896 + }, + { + "auxiliary_loss_clip": 0.01154342, + "auxiliary_loss_mlp": 0.01026684, + "balance_loss_clip": 1.04996347, + "balance_loss_mlp": 1.01880467, + "epoch": 0.5673059580352312, + "flos": 23221348174080.0, + "grad_norm": 1.815736403216912, + "language_loss": 0.82125729, + "learning_rate": 1.6627661518234765e-06, + "loss": 0.84306753, + "num_input_tokens_seen": 101837280, + "step": 4718, + "time_per_iteration": 3.285160779953003 + }, + { + "auxiliary_loss_clip": 0.01126167, + "auxiliary_loss_mlp": 0.0102668, + "balance_loss_clip": 1.04962659, + "balance_loss_mlp": 1.01835918, + "epoch": 0.5674262009258703, + "flos": 21719599430400.0, + "grad_norm": 1.5566206295639413, + "language_loss": 0.85178322, + "learning_rate": 1.661998359356302e-06, + "loss": 0.87331164, + "num_input_tokens_seen": 101856310, + "step": 4719, + "time_per_iteration": 2.6344470977783203 + }, + { + "auxiliary_loss_clip": 0.01074504, + "auxiliary_loss_mlp": 0.01003021, + "balance_loss_clip": 1.01428938, + "balance_loss_mlp": 1.00207376, + "epoch": 0.5675464438165093, + "flos": 67470369114240.0, + "grad_norm": 0.7465739752672518, + "language_loss": 0.55847228, + "learning_rate": 1.6612306181637077e-06, + "loss": 0.57924747, + "num_input_tokens_seen": 101915635, + "step": 4720, + "time_per_iteration": 2.9846572875976562 + }, + { + "auxiliary_loss_clip": 0.01132208, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.0465306, + "balance_loss_mlp": 1.02393615, + "epoch": 0.5676666867071485, + "flos": 18879173688960.0, + "grad_norm": 2.850211380728475, + "language_loss": 0.65545309, + "learning_rate": 1.6604629283621598e-06, + "loss": 0.67709291, + "num_input_tokens_seen": 101933565, + "step": 4721, + "time_per_iteration": 2.5548489093780518 + }, + { + "auxiliary_loss_clip": 0.0118074, + "auxiliary_loss_mlp": 0.01032694, + "balance_loss_clip": 1.05319858, + "balance_loss_mlp": 1.02409911, + "epoch": 0.5677869295977875, + "flos": 33546778744320.0, + "grad_norm": 1.8710218947673942, + "language_loss": 0.74162489, + "learning_rate": 1.6596952900681152e-06, + "loss": 0.76375926, + "num_input_tokens_seen": 101954325, + "step": 4722, + "time_per_iteration": 2.5515873432159424 + }, + { + "auxiliary_loss_clip": 0.01120108, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.053478, + "balance_loss_mlp": 1.02299154, + "epoch": 0.5679071724884266, + "flos": 28037256157440.0, + "grad_norm": 2.162052294578075, + "language_loss": 0.81846702, + "learning_rate": 1.658927703398025e-06, + "loss": 0.83998537, + "num_input_tokens_seen": 101974390, + "step": 4723, + "time_per_iteration": 2.6018714904785156 + }, + { + "auxiliary_loss_clip": 0.01119953, + "auxiliary_loss_mlp": 0.01026077, + "balance_loss_clip": 1.04188263, + "balance_loss_mlp": 1.01801205, + "epoch": 0.5680274153790658, + "flos": 23550110380800.0, + "grad_norm": 2.5546751487288164, + "language_loss": 0.7823928, + "learning_rate": 1.6581601684683309e-06, + "loss": 0.80385315, + "num_input_tokens_seen": 101994815, + "step": 4724, + "time_per_iteration": 2.5922391414642334 + }, + { + "auxiliary_loss_clip": 0.0116453, + "auxiliary_loss_mlp": 0.0102915, + "balance_loss_clip": 1.05205464, + "balance_loss_mlp": 1.02184284, + "epoch": 0.5681476582697048, + "flos": 22455158140800.0, + "grad_norm": 2.6734174527055248, + "language_loss": 0.68262964, + "learning_rate": 1.6573926853954674e-06, + "loss": 0.70456648, + "num_input_tokens_seen": 102012400, + "step": 4725, + "time_per_iteration": 2.4544126987457275 + }, + { + "auxiliary_loss_clip": 0.01141419, + "auxiliary_loss_mlp": 0.01025158, + "balance_loss_clip": 1.0447166, + "balance_loss_mlp": 1.01714468, + "epoch": 0.5682679011603439, + "flos": 19536913584000.0, + "grad_norm": 1.8993748613005417, + "language_loss": 0.8323307, + "learning_rate": 1.6566252542958608e-06, + "loss": 0.85399646, + "num_input_tokens_seen": 102031900, + "step": 4726, + "time_per_iteration": 2.5696568489074707 + }, + { + "auxiliary_loss_clip": 0.01131414, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.04947686, + "balance_loss_mlp": 1.02264643, + "epoch": 0.568388144050983, + "flos": 28765488493440.0, + "grad_norm": 1.9272812791611291, + "language_loss": 0.78774786, + "learning_rate": 1.6558578752859305e-06, + "loss": 0.80936885, + "num_input_tokens_seen": 102050860, + "step": 4727, + "time_per_iteration": 2.585646152496338 + }, + { + "auxiliary_loss_clip": 0.01134925, + "auxiliary_loss_mlp": 0.01025424, + "balance_loss_clip": 1.04784811, + "balance_loss_mlp": 1.01809299, + "epoch": 0.5685083869416221, + "flos": 21209452519680.0, + "grad_norm": 3.4885767045964595, + "language_loss": 0.78853911, + "learning_rate": 1.6550905484820865e-06, + "loss": 0.81014264, + "num_input_tokens_seen": 102069320, + "step": 4728, + "time_per_iteration": 2.530128002166748 + }, + { + "auxiliary_loss_clip": 0.01180409, + "auxiliary_loss_mlp": 0.01028559, + "balance_loss_clip": 1.05264819, + "balance_loss_mlp": 1.02021992, + "epoch": 0.5686286298322611, + "flos": 24827021942400.0, + "grad_norm": 2.1654557125574616, + "language_loss": 0.79086977, + "learning_rate": 1.6543232740007328e-06, + "loss": 0.81295943, + "num_input_tokens_seen": 102086435, + "step": 4729, + "time_per_iteration": 2.4443626403808594 + }, + { + "auxiliary_loss_clip": 0.01167989, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.05272341, + "balance_loss_mlp": 1.0197109, + "epoch": 0.5687488727229003, + "flos": 26615121909120.0, + "grad_norm": 2.5494685832169597, + "language_loss": 0.67172599, + "learning_rate": 1.653556051958263e-06, + "loss": 0.69368404, + "num_input_tokens_seen": 102106115, + "step": 4730, + "time_per_iteration": 2.507558584213257 + }, + { + "auxiliary_loss_clip": 0.0108906, + "auxiliary_loss_mlp": 0.01026949, + "balance_loss_clip": 1.04206955, + "balance_loss_mlp": 1.01878262, + "epoch": 0.5688691156135394, + "flos": 20808725414400.0, + "grad_norm": 2.099238030279477, + "language_loss": 0.74115926, + "learning_rate": 1.6527888824710642e-06, + "loss": 0.76231939, + "num_input_tokens_seen": 102125715, + "step": 4731, + "time_per_iteration": 2.610666036605835 + }, + { + "auxiliary_loss_clip": 0.01128763, + "auxiliary_loss_mlp": 0.0102978, + "balance_loss_clip": 1.04457283, + "balance_loss_mlp": 1.02127421, + "epoch": 0.5689893585041784, + "flos": 25880963829120.0, + "grad_norm": 2.0808562393608097, + "language_loss": 0.76409227, + "learning_rate": 1.6520217656555166e-06, + "loss": 0.78567767, + "num_input_tokens_seen": 102145005, + "step": 4732, + "time_per_iteration": 2.553199529647827 + }, + { + "auxiliary_loss_clip": 0.01138573, + "auxiliary_loss_mlp": 0.01031371, + "balance_loss_clip": 1.0479393, + "balance_loss_mlp": 1.02349174, + "epoch": 0.5691096013948175, + "flos": 23477463123840.0, + "grad_norm": 1.4975347240184178, + "language_loss": 0.71006942, + "learning_rate": 1.65125470162799e-06, + "loss": 0.73176897, + "num_input_tokens_seen": 102165360, + "step": 4733, + "time_per_iteration": 2.511319398880005 + }, + { + "auxiliary_loss_clip": 0.01135727, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.04484749, + "balance_loss_mlp": 1.02233577, + "epoch": 0.5692298442854566, + "flos": 18075600576000.0, + "grad_norm": 1.9645453373813944, + "language_loss": 0.69870341, + "learning_rate": 1.6504876905048485e-06, + "loss": 0.72036344, + "num_input_tokens_seen": 102182320, + "step": 4734, + "time_per_iteration": 2.519029140472412 + }, + { + "auxiliary_loss_clip": 0.01178031, + "auxiliary_loss_mlp": 0.01025647, + "balance_loss_clip": 1.05421352, + "balance_loss_mlp": 1.01810122, + "epoch": 0.5693500871760957, + "flos": 23039317025280.0, + "grad_norm": 1.855190993853517, + "language_loss": 0.72103029, + "learning_rate": 1.6497207324024464e-06, + "loss": 0.74306709, + "num_input_tokens_seen": 102201220, + "step": 4735, + "time_per_iteration": 2.4509618282318115 + }, + { + "auxiliary_loss_clip": 0.01157478, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.04972386, + "balance_loss_mlp": 1.02180362, + "epoch": 0.5694703300667348, + "flos": 18989670902400.0, + "grad_norm": 13.772233856028635, + "language_loss": 0.82644027, + "learning_rate": 1.6489538274371305e-06, + "loss": 0.84831274, + "num_input_tokens_seen": 102219825, + "step": 4736, + "time_per_iteration": 3.3168396949768066 + }, + { + "auxiliary_loss_clip": 0.01158419, + "auxiliary_loss_mlp": 0.01028877, + "balance_loss_clip": 1.05204082, + "balance_loss_mlp": 1.02127445, + "epoch": 0.5695905729573739, + "flos": 21908705558400.0, + "grad_norm": 1.871675628880176, + "language_loss": 0.83240873, + "learning_rate": 1.6481869757252396e-06, + "loss": 0.85428166, + "num_input_tokens_seen": 102238160, + "step": 4737, + "time_per_iteration": 2.4798386096954346 + }, + { + "auxiliary_loss_clip": 0.01165729, + "auxiliary_loss_mlp": 0.01030887, + "balance_loss_clip": 1.05389452, + "balance_loss_mlp": 1.02334714, + "epoch": 0.569710815848013, + "flos": 28476659232000.0, + "grad_norm": 1.470773345299097, + "language_loss": 0.71815145, + "learning_rate": 1.647420177383105e-06, + "loss": 0.74011767, + "num_input_tokens_seen": 102261030, + "step": 4738, + "time_per_iteration": 2.5471749305725098 + }, + { + "auxiliary_loss_clip": 0.01160067, + "auxiliary_loss_mlp": 0.01024843, + "balance_loss_clip": 1.05376577, + "balance_loss_mlp": 1.01752019, + "epoch": 0.569831058738652, + "flos": 28366162018560.0, + "grad_norm": 1.804050814597485, + "language_loss": 0.72515523, + "learning_rate": 1.646653432527049e-06, + "loss": 0.74700433, + "num_input_tokens_seen": 102281670, + "step": 4739, + "time_per_iteration": 3.3799397945404053 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.01025129, + "balance_loss_clip": 1.05113959, + "balance_loss_mlp": 1.01740456, + "epoch": 0.5699513016292912, + "flos": 25849973370240.0, + "grad_norm": 1.5147571984855663, + "language_loss": 0.74583161, + "learning_rate": 1.645886741273387e-06, + "loss": 0.76747358, + "num_input_tokens_seen": 102303485, + "step": 4740, + "time_per_iteration": 2.582517623901367 + }, + { + "auxiliary_loss_clip": 0.01134086, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.05308735, + "balance_loss_mlp": 1.0250783, + "epoch": 0.5700715445199303, + "flos": 18037858360320.0, + "grad_norm": 1.8819933248917702, + "language_loss": 0.7388643, + "learning_rate": 1.645120103738424e-06, + "loss": 0.76053715, + "num_input_tokens_seen": 102320995, + "step": 4741, + "time_per_iteration": 2.5203750133514404 + }, + { + "auxiliary_loss_clip": 0.01151465, + "auxiliary_loss_mlp": 0.00762225, + "balance_loss_clip": 1.04902637, + "balance_loss_mlp": 1.00041485, + "epoch": 0.5701917874105693, + "flos": 11473352392320.0, + "grad_norm": 2.1662987553458968, + "language_loss": 0.83098698, + "learning_rate": 1.6443535200384591e-06, + "loss": 0.85012382, + "num_input_tokens_seen": 102339170, + "step": 4742, + "time_per_iteration": 3.3173928260803223 + }, + { + "auxiliary_loss_clip": 0.01180091, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.05387616, + "balance_loss_mlp": 1.02391887, + "epoch": 0.5703120303012085, + "flos": 21761759018880.0, + "grad_norm": 1.7339324478765894, + "language_loss": 0.70340449, + "learning_rate": 1.6435869902897827e-06, + "loss": 0.72552609, + "num_input_tokens_seen": 102357750, + "step": 4743, + "time_per_iteration": 2.4915976524353027 + }, + { + "auxiliary_loss_clip": 0.01042878, + "auxiliary_loss_mlp": 0.01002232, + "balance_loss_clip": 1.01622462, + "balance_loss_mlp": 1.00115955, + "epoch": 0.5704322731918475, + "flos": 56746258513920.0, + "grad_norm": 0.7916318192070402, + "language_loss": 0.62012851, + "learning_rate": 1.6428205146086764e-06, + "loss": 0.6405797, + "num_input_tokens_seen": 102419730, + "step": 4744, + "time_per_iteration": 3.156371593475342 + }, + { + "auxiliary_loss_clip": 0.01155329, + "auxiliary_loss_mlp": 0.01027641, + "balance_loss_clip": 1.04986286, + "balance_loss_mlp": 1.01962423, + "epoch": 0.5705525160824866, + "flos": 20741141975040.0, + "grad_norm": 1.5881622900639671, + "language_loss": 0.70522523, + "learning_rate": 1.6420540931114142e-06, + "loss": 0.72705495, + "num_input_tokens_seen": 102440320, + "step": 4745, + "time_per_iteration": 3.334642171859741 + }, + { + "auxiliary_loss_clip": 0.01150573, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.0490464, + "balance_loss_mlp": 1.0315944, + "epoch": 0.5706727589731257, + "flos": 18771262254720.0, + "grad_norm": 2.9952781073061967, + "language_loss": 0.78852087, + "learning_rate": 1.6412877259142616e-06, + "loss": 0.81041968, + "num_input_tokens_seen": 102460240, + "step": 4746, + "time_per_iteration": 2.5048701763153076 + }, + { + "auxiliary_loss_clip": 0.01147025, + "auxiliary_loss_mlp": 0.01026542, + "balance_loss_clip": 1.05052066, + "balance_loss_mlp": 1.0187099, + "epoch": 0.5707930018637648, + "flos": 27634733372160.0, + "grad_norm": 3.0466808057231773, + "language_loss": 0.73664892, + "learning_rate": 1.6405214131334757e-06, + "loss": 0.75838459, + "num_input_tokens_seen": 102478765, + "step": 4747, + "time_per_iteration": 2.5273215770721436 + }, + { + "auxiliary_loss_clip": 0.01119149, + "auxiliary_loss_mlp": 0.0102502, + "balance_loss_clip": 1.04923499, + "balance_loss_mlp": 1.01717043, + "epoch": 0.5709132447544039, + "flos": 27597673514880.0, + "grad_norm": 2.1510479692751954, + "language_loss": 0.79685748, + "learning_rate": 1.6397551548853052e-06, + "loss": 0.81829917, + "num_input_tokens_seen": 102496930, + "step": 4748, + "time_per_iteration": 2.60506534576416 + }, + { + "auxiliary_loss_clip": 0.01150623, + "auxiliary_loss_mlp": 0.01027717, + "balance_loss_clip": 1.05115271, + "balance_loss_mlp": 1.01989698, + "epoch": 0.571033487645043, + "flos": 21686095019520.0, + "grad_norm": 1.707413932690761, + "language_loss": 0.70669395, + "learning_rate": 1.6389889512859917e-06, + "loss": 0.72847736, + "num_input_tokens_seen": 102516590, + "step": 4749, + "time_per_iteration": 2.495048999786377 + }, + { + "auxiliary_loss_clip": 0.01051427, + "auxiliary_loss_mlp": 0.01001945, + "balance_loss_clip": 1.01452708, + "balance_loss_mlp": 1.00094342, + "epoch": 0.5711537305356821, + "flos": 70181445980160.0, + "grad_norm": 1.5239875771762392, + "language_loss": 0.60341245, + "learning_rate": 1.638222802451767e-06, + "loss": 0.62394613, + "num_input_tokens_seen": 102578070, + "step": 4750, + "time_per_iteration": 3.078207015991211 + }, + { + "auxiliary_loss_clip": 0.01159788, + "auxiliary_loss_mlp": 0.01025052, + "balance_loss_clip": 1.05223393, + "balance_loss_mlp": 1.01755369, + "epoch": 0.5712739734263211, + "flos": 24717494396160.0, + "grad_norm": 3.5052093593161797, + "language_loss": 0.75152946, + "learning_rate": 1.6374567084988561e-06, + "loss": 0.77337784, + "num_input_tokens_seen": 102599255, + "step": 4751, + "time_per_iteration": 2.490955114364624 + }, + { + "auxiliary_loss_clip": 0.01156749, + "auxiliary_loss_mlp": 0.01027565, + "balance_loss_clip": 1.05352747, + "balance_loss_mlp": 1.01873159, + "epoch": 0.5713942163169603, + "flos": 26578169792640.0, + "grad_norm": 1.9093210784622567, + "language_loss": 0.76478082, + "learning_rate": 1.6366906695434738e-06, + "loss": 0.78662395, + "num_input_tokens_seen": 102621775, + "step": 4752, + "time_per_iteration": 2.5401628017425537 + }, + { + "auxiliary_loss_clip": 0.01167696, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.0564239, + "balance_loss_mlp": 1.02113795, + "epoch": 0.5715144592075994, + "flos": 21142443697920.0, + "grad_norm": 2.0891406145388656, + "language_loss": 0.86240387, + "learning_rate": 1.6359246857018275e-06, + "loss": 0.88436872, + "num_input_tokens_seen": 102639305, + "step": 4753, + "time_per_iteration": 2.456295967102051 + }, + { + "auxiliary_loss_clip": 0.01121055, + "auxiliary_loss_mlp": 0.01024696, + "balance_loss_clip": 1.04586601, + "balance_loss_mlp": 1.01672101, + "epoch": 0.5716347020982384, + "flos": 23330265189120.0, + "grad_norm": 5.332455642797424, + "language_loss": 0.7806741, + "learning_rate": 1.6351587570901178e-06, + "loss": 0.80213165, + "num_input_tokens_seen": 102659430, + "step": 4754, + "time_per_iteration": 2.570239305496216 + }, + { + "auxiliary_loss_clip": 0.0113945, + "auxiliary_loss_mlp": 0.01030149, + "balance_loss_clip": 1.05317783, + "balance_loss_mlp": 1.02257323, + "epoch": 0.5717549449888776, + "flos": 17009555806080.0, + "grad_norm": 3.52161720218778, + "language_loss": 0.75976098, + "learning_rate": 1.634392883824534e-06, + "loss": 0.78145695, + "num_input_tokens_seen": 102671430, + "step": 4755, + "time_per_iteration": 2.4697248935699463 + }, + { + "auxiliary_loss_clip": 0.01124218, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.04681683, + "balance_loss_mlp": 1.0193162, + "epoch": 0.5718751878795166, + "flos": 35518130922240.0, + "grad_norm": 1.7099825837522165, + "language_loss": 0.67718029, + "learning_rate": 1.6336270660212595e-06, + "loss": 0.69869781, + "num_input_tokens_seen": 102693025, + "step": 4756, + "time_per_iteration": 2.679170608520508 + }, + { + "auxiliary_loss_clip": 0.01151473, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.05642188, + "balance_loss_mlp": 1.01994491, + "epoch": 0.5719954307701557, + "flos": 38613989255040.0, + "grad_norm": 2.098109128742455, + "language_loss": 0.65970272, + "learning_rate": 1.6328613037964676e-06, + "loss": 0.68150461, + "num_input_tokens_seen": 102716090, + "step": 4757, + "time_per_iteration": 2.632429599761963 + }, + { + "auxiliary_loss_clip": 0.0116407, + "auxiliary_loss_mlp": 0.01024895, + "balance_loss_clip": 1.05146694, + "balance_loss_mlp": 1.01708078, + "epoch": 0.5721156736607949, + "flos": 20631111638400.0, + "grad_norm": 2.2049568480010024, + "language_loss": 0.67957032, + "learning_rate": 1.6320955972663241e-06, + "loss": 0.70145994, + "num_input_tokens_seen": 102735685, + "step": 4758, + "time_per_iteration": 2.4628214836120605 + }, + { + "auxiliary_loss_clip": 0.01165425, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.05173123, + "balance_loss_mlp": 1.02092004, + "epoch": 0.5722359165514339, + "flos": 37415076076800.0, + "grad_norm": 3.2790342293734995, + "language_loss": 0.65478462, + "learning_rate": 1.6313299465469857e-06, + "loss": 0.67672646, + "num_input_tokens_seen": 102758415, + "step": 4759, + "time_per_iteration": 2.6060216426849365 + }, + { + "auxiliary_loss_clip": 0.0116235, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.05295944, + "balance_loss_mlp": 1.02113032, + "epoch": 0.572356159442073, + "flos": 21972877205760.0, + "grad_norm": 3.0941458673995004, + "language_loss": 0.79842687, + "learning_rate": 1.6305643517546014e-06, + "loss": 0.82034677, + "num_input_tokens_seen": 102773795, + "step": 4760, + "time_per_iteration": 2.4629385471343994 + }, + { + "auxiliary_loss_clip": 0.01178226, + "auxiliary_loss_mlp": 0.01036987, + "balance_loss_clip": 1.05441713, + "balance_loss_mlp": 1.02931631, + "epoch": 0.5724764023327121, + "flos": 19135540033920.0, + "grad_norm": 2.205102241371106, + "language_loss": 0.84799659, + "learning_rate": 1.629798813005311e-06, + "loss": 0.87014878, + "num_input_tokens_seen": 102793515, + "step": 4761, + "time_per_iteration": 2.442936897277832 + }, + { + "auxiliary_loss_clip": 0.01122685, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.04923034, + "balance_loss_mlp": 1.02066636, + "epoch": 0.5725966452233512, + "flos": 22819759142400.0, + "grad_norm": 2.060900104094086, + "language_loss": 0.71421975, + "learning_rate": 1.6290333304152473e-06, + "loss": 0.73572958, + "num_input_tokens_seen": 102813390, + "step": 4762, + "time_per_iteration": 3.454465866088867 + }, + { + "auxiliary_loss_clip": 0.01150804, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.05635846, + "balance_loss_mlp": 1.02433848, + "epoch": 0.5727168881139902, + "flos": 41496610498560.0, + "grad_norm": 1.739227773846657, + "language_loss": 0.56967151, + "learning_rate": 1.6282679041005314e-06, + "loss": 0.59150416, + "num_input_tokens_seen": 102838980, + "step": 4763, + "time_per_iteration": 2.65539813041687 + }, + { + "auxiliary_loss_clip": 0.01141807, + "auxiliary_loss_mlp": 0.01023589, + "balance_loss_clip": 1.04769182, + "balance_loss_mlp": 1.01583421, + "epoch": 0.5728371310046293, + "flos": 14647675985280.0, + "grad_norm": 4.273514517086232, + "language_loss": 0.87447834, + "learning_rate": 1.6275025341772789e-06, + "loss": 0.89613229, + "num_input_tokens_seen": 102855285, + "step": 4764, + "time_per_iteration": 2.451650857925415 + }, + { + "auxiliary_loss_clip": 0.01150931, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.04970515, + "balance_loss_mlp": 1.02058053, + "epoch": 0.5729573738952685, + "flos": 21506613736320.0, + "grad_norm": 2.603330793803001, + "language_loss": 0.81770635, + "learning_rate": 1.626737220761596e-06, + "loss": 0.83950877, + "num_input_tokens_seen": 102872750, + "step": 4765, + "time_per_iteration": 2.4955108165740967 + }, + { + "auxiliary_loss_clip": 0.01160917, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.0530231, + "balance_loss_mlp": 1.02030087, + "epoch": 0.5730776167859075, + "flos": 23621680229760.0, + "grad_norm": 2.3190990351956633, + "language_loss": 0.7847755, + "learning_rate": 1.62597196396958e-06, + "loss": 0.80666685, + "num_input_tokens_seen": 102890920, + "step": 4766, + "time_per_iteration": 3.3220200538635254 + }, + { + "auxiliary_loss_clip": 0.01167287, + "auxiliary_loss_mlp": 0.01026303, + "balance_loss_clip": 1.05431175, + "balance_loss_mlp": 1.01817298, + "epoch": 0.5731978596765466, + "flos": 25739224761600.0, + "grad_norm": 1.854538852857469, + "language_loss": 0.85461199, + "learning_rate": 1.6252067639173197e-06, + "loss": 0.87654787, + "num_input_tokens_seen": 102912830, + "step": 4767, + "time_per_iteration": 2.495978832244873 + }, + { + "auxiliary_loss_clip": 0.01165056, + "auxiliary_loss_mlp": 0.01027951, + "balance_loss_clip": 1.0521971, + "balance_loss_mlp": 1.01991653, + "epoch": 0.5733181025671857, + "flos": 26359509749760.0, + "grad_norm": 1.8074348970593332, + "language_loss": 0.69824123, + "learning_rate": 1.6244416207208956e-06, + "loss": 0.72017127, + "num_input_tokens_seen": 102933765, + "step": 4768, + "time_per_iteration": 3.297753095626831 + }, + { + "auxiliary_loss_clip": 0.01138937, + "auxiliary_loss_mlp": 0.01033193, + "balance_loss_clip": 1.05142665, + "balance_loss_mlp": 1.02530742, + "epoch": 0.5734383454578248, + "flos": 29423874833280.0, + "grad_norm": 1.6160593431772499, + "language_loss": 0.73711258, + "learning_rate": 1.6236765344963787e-06, + "loss": 0.75883389, + "num_input_tokens_seen": 102955025, + "step": 4769, + "time_per_iteration": 2.586174964904785 + }, + { + "auxiliary_loss_clip": 0.01151216, + "auxiliary_loss_mlp": 0.01025143, + "balance_loss_clip": 1.05109143, + "balance_loss_mlp": 1.01734674, + "epoch": 0.5735585883484638, + "flos": 34969954487040.0, + "grad_norm": 2.519054706805177, + "language_loss": 0.68985176, + "learning_rate": 1.6229115053598322e-06, + "loss": 0.71161532, + "num_input_tokens_seen": 102976780, + "step": 4770, + "time_per_iteration": 2.602721929550171 + }, + { + "auxiliary_loss_clip": 0.01167753, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.05519235, + "balance_loss_mlp": 1.02214599, + "epoch": 0.573678831239103, + "flos": 18770759464320.0, + "grad_norm": 1.6986385436856126, + "language_loss": 0.72035605, + "learning_rate": 1.6221465334273108e-06, + "loss": 0.74233383, + "num_input_tokens_seen": 102995990, + "step": 4771, + "time_per_iteration": 3.1759603023529053 + }, + { + "auxiliary_loss_clip": 0.01141856, + "auxiliary_loss_mlp": 0.01024853, + "balance_loss_clip": 1.04956734, + "balance_loss_mlp": 1.01685977, + "epoch": 0.5737990741297421, + "flos": 25702883176320.0, + "grad_norm": 2.1656902682139645, + "language_loss": 0.61840439, + "learning_rate": 1.6213816188148593e-06, + "loss": 0.64007151, + "num_input_tokens_seen": 103014695, + "step": 4772, + "time_per_iteration": 2.5560390949249268 + }, + { + "auxiliary_loss_clip": 0.01145857, + "auxiliary_loss_mlp": 0.01028749, + "balance_loss_clip": 1.0553143, + "balance_loss_mlp": 1.02075052, + "epoch": 0.5739193170203811, + "flos": 27269234530560.0, + "grad_norm": 1.758216911477341, + "language_loss": 0.77144563, + "learning_rate": 1.6206167616385162e-06, + "loss": 0.79319167, + "num_input_tokens_seen": 103035760, + "step": 4773, + "time_per_iteration": 2.569058895111084 + }, + { + "auxiliary_loss_clip": 0.0115817, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.05315411, + "balance_loss_mlp": 1.02077007, + "epoch": 0.5740395599110203, + "flos": 12239721993600.0, + "grad_norm": 3.3084818464407175, + "language_loss": 0.73481417, + "learning_rate": 1.6198519620143078e-06, + "loss": 0.75668442, + "num_input_tokens_seen": 103052915, + "step": 4774, + "time_per_iteration": 2.472527503967285 + }, + { + "auxiliary_loss_clip": 0.01142225, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.0526861, + "balance_loss_mlp": 1.02453017, + "epoch": 0.5741598028016593, + "flos": 25921399564800.0, + "grad_norm": 1.5924289218745713, + "language_loss": 0.78225183, + "learning_rate": 1.6190872200582546e-06, + "loss": 0.80399728, + "num_input_tokens_seen": 103074655, + "step": 4775, + "time_per_iteration": 2.5884151458740234 + }, + { + "auxiliary_loss_clip": 0.01146656, + "auxiliary_loss_mlp": 0.00762394, + "balance_loss_clip": 1.05057156, + "balance_loss_mlp": 1.00039613, + "epoch": 0.5742800456922984, + "flos": 19244133826560.0, + "grad_norm": 2.085643995961095, + "language_loss": 0.7782892, + "learning_rate": 1.6183225358863676e-06, + "loss": 0.79737973, + "num_input_tokens_seen": 103091550, + "step": 4776, + "time_per_iteration": 2.4914186000823975 + }, + { + "auxiliary_loss_clip": 0.01141872, + "auxiliary_loss_mlp": 0.01028435, + "balance_loss_clip": 1.04854012, + "balance_loss_mlp": 1.01994717, + "epoch": 0.5744002885829376, + "flos": 30920487932160.0, + "grad_norm": 2.43199619117625, + "language_loss": 0.71885306, + "learning_rate": 1.617557909614648e-06, + "loss": 0.74055612, + "num_input_tokens_seen": 103110985, + "step": 4777, + "time_per_iteration": 2.5706887245178223 + }, + { + "auxiliary_loss_clip": 0.01134175, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.04754949, + "balance_loss_mlp": 1.02012062, + "epoch": 0.5745205314735766, + "flos": 23840017050240.0, + "grad_norm": 2.1036283836514804, + "language_loss": 0.86193657, + "learning_rate": 1.6167933413590899e-06, + "loss": 0.88355571, + "num_input_tokens_seen": 103129890, + "step": 4778, + "time_per_iteration": 2.548201084136963 + }, + { + "auxiliary_loss_clip": 0.01163781, + "auxiliary_loss_mlp": 0.01031255, + "balance_loss_clip": 1.05230212, + "balance_loss_mlp": 1.02337563, + "epoch": 0.5746407743642157, + "flos": 12311902373760.0, + "grad_norm": 7.07477411183321, + "language_loss": 0.90704578, + "learning_rate": 1.6160288312356773e-06, + "loss": 0.92899609, + "num_input_tokens_seen": 103147020, + "step": 4779, + "time_per_iteration": 2.458759307861328 + }, + { + "auxiliary_loss_clip": 0.01168738, + "auxiliary_loss_mlp": 0.01027353, + "balance_loss_clip": 1.05215931, + "balance_loss_mlp": 1.01888895, + "epoch": 0.5747610172548548, + "flos": 24133658734080.0, + "grad_norm": 1.5443493866202302, + "language_loss": 0.81743348, + "learning_rate": 1.6152643793603857e-06, + "loss": 0.83939433, + "num_input_tokens_seen": 103167370, + "step": 4780, + "time_per_iteration": 2.4920387268066406 + }, + { + "auxiliary_loss_clip": 0.01179624, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.05525708, + "balance_loss_mlp": 1.02006555, + "epoch": 0.5748812601454939, + "flos": 25408451393280.0, + "grad_norm": 1.7335684289408109, + "language_loss": 0.87785637, + "learning_rate": 1.6144999858491815e-06, + "loss": 0.89993024, + "num_input_tokens_seen": 103186000, + "step": 4781, + "time_per_iteration": 2.4937713146209717 + }, + { + "auxiliary_loss_clip": 0.01156142, + "auxiliary_loss_mlp": 0.01024267, + "balance_loss_clip": 1.05169177, + "balance_loss_mlp": 1.01584518, + "epoch": 0.575001503036133, + "flos": 30624942827520.0, + "grad_norm": 1.595080321016195, + "language_loss": 0.85633057, + "learning_rate": 1.6137356508180232e-06, + "loss": 0.87813461, + "num_input_tokens_seen": 103207710, + "step": 4782, + "time_per_iteration": 2.57137131690979 + }, + { + "auxiliary_loss_clip": 0.01180839, + "auxiliary_loss_mlp": 0.00762296, + "balance_loss_clip": 1.05387294, + "balance_loss_mlp": 1.00040984, + "epoch": 0.5751217459267721, + "flos": 21726566668800.0, + "grad_norm": 2.235736534573803, + "language_loss": 0.81298852, + "learning_rate": 1.6129713743828593e-06, + "loss": 0.83241987, + "num_input_tokens_seen": 103226720, + "step": 4783, + "time_per_iteration": 2.448772430419922 + }, + { + "auxiliary_loss_clip": 0.01151621, + "auxiliary_loss_mlp": 0.01027815, + "balance_loss_clip": 1.04941893, + "balance_loss_mlp": 1.02030456, + "epoch": 0.5752419888174112, + "flos": 21651620941440.0, + "grad_norm": 1.523641375629479, + "language_loss": 0.75432396, + "learning_rate": 1.6122071566596306e-06, + "loss": 0.77611828, + "num_input_tokens_seen": 103246995, + "step": 4784, + "time_per_iteration": 2.5059280395507812 + }, + { + "auxiliary_loss_clip": 0.01168195, + "auxiliary_loss_mlp": 0.01029577, + "balance_loss_clip": 1.05327964, + "balance_loss_mlp": 1.02119637, + "epoch": 0.5753622317080502, + "flos": 17775997234560.0, + "grad_norm": 2.744317140698451, + "language_loss": 0.83043426, + "learning_rate": 1.6114429977642674e-06, + "loss": 0.85241199, + "num_input_tokens_seen": 103261500, + "step": 4785, + "time_per_iteration": 2.424600601196289 + }, + { + "auxiliary_loss_clip": 0.01166476, + "auxiliary_loss_mlp": 0.0102614, + "balance_loss_clip": 1.05516362, + "balance_loss_mlp": 1.01873064, + "epoch": 0.5754824745986894, + "flos": 19789616741760.0, + "grad_norm": 1.7930361664239713, + "language_loss": 0.73213071, + "learning_rate": 1.6106788978126926e-06, + "loss": 0.75405687, + "num_input_tokens_seen": 103280475, + "step": 4786, + "time_per_iteration": 2.4599227905273438 + }, + { + "auxiliary_loss_clip": 0.01118287, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.04471731, + "balance_loss_mlp": 1.02342129, + "epoch": 0.5756027174893285, + "flos": 30985665160320.0, + "grad_norm": 5.146094576227733, + "language_loss": 0.79100758, + "learning_rate": 1.6099148569208196e-06, + "loss": 0.81251246, + "num_input_tokens_seen": 103297695, + "step": 4787, + "time_per_iteration": 2.624391794204712 + }, + { + "auxiliary_loss_clip": 0.01154895, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.05490386, + "balance_loss_mlp": 1.02555287, + "epoch": 0.5757229603799675, + "flos": 28546864364160.0, + "grad_norm": 2.6128869188007537, + "language_loss": 0.62864387, + "learning_rate": 1.6091508752045523e-06, + "loss": 0.6505385, + "num_input_tokens_seen": 103318575, + "step": 4788, + "time_per_iteration": 2.5386650562286377 + }, + { + "auxiliary_loss_clip": 0.01127626, + "auxiliary_loss_mlp": 0.01025739, + "balance_loss_clip": 1.04600096, + "balance_loss_mlp": 1.01770473, + "epoch": 0.5758432032706067, + "flos": 22999024944000.0, + "grad_norm": 2.168295974326995, + "language_loss": 0.86428696, + "learning_rate": 1.608386952779787e-06, + "loss": 0.88582063, + "num_input_tokens_seen": 103337945, + "step": 4789, + "time_per_iteration": 3.3623316287994385 + }, + { + "auxiliary_loss_clip": 0.011567, + "auxiliary_loss_mlp": 0.01026606, + "balance_loss_clip": 1.05294979, + "balance_loss_mlp": 1.01910138, + "epoch": 0.5759634461612457, + "flos": 25739727552000.0, + "grad_norm": 1.6391115248064005, + "language_loss": 0.7456978, + "learning_rate": 1.6076230897624098e-06, + "loss": 0.76753086, + "num_input_tokens_seen": 103360150, + "step": 4790, + "time_per_iteration": 2.5459370613098145 + }, + { + "auxiliary_loss_clip": 0.01165915, + "auxiliary_loss_mlp": 0.01030199, + "balance_loss_clip": 1.05057538, + "balance_loss_mlp": 1.02169979, + "epoch": 0.5760836890518848, + "flos": 30591761639040.0, + "grad_norm": 2.297407168241599, + "language_loss": 0.77500349, + "learning_rate": 1.6068592862682974e-06, + "loss": 0.79696465, + "num_input_tokens_seen": 103378305, + "step": 4791, + "time_per_iteration": 2.5299479961395264 + }, + { + "auxiliary_loss_clip": 0.01153211, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.05213237, + "balance_loss_mlp": 1.02332222, + "epoch": 0.576203931942524, + "flos": 36538963447680.0, + "grad_norm": 1.773175782769849, + "language_loss": 0.73385715, + "learning_rate": 1.6060955424133187e-06, + "loss": 0.7557013, + "num_input_tokens_seen": 103399230, + "step": 4792, + "time_per_iteration": 3.5237619876861572 + }, + { + "auxiliary_loss_clip": 0.01165923, + "auxiliary_loss_mlp": 0.01026662, + "balance_loss_clip": 1.05542064, + "balance_loss_mlp": 1.01798964, + "epoch": 0.576324174833163, + "flos": 25516937445120.0, + "grad_norm": 1.743717787566893, + "language_loss": 0.89414316, + "learning_rate": 1.6053318583133332e-06, + "loss": 0.91606903, + "num_input_tokens_seen": 103420100, + "step": 4793, + "time_per_iteration": 2.5145506858825684 + }, + { + "auxiliary_loss_clip": 0.01164846, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.05392146, + "balance_loss_mlp": 1.02375805, + "epoch": 0.5764444177238021, + "flos": 25119262995840.0, + "grad_norm": 2.0376276274853775, + "language_loss": 0.75491732, + "learning_rate": 1.6045682340841907e-06, + "loss": 0.77688712, + "num_input_tokens_seen": 103439025, + "step": 4794, + "time_per_iteration": 2.5021474361419678 + }, + { + "auxiliary_loss_clip": 0.0105585, + "auxiliary_loss_mlp": 0.00752672, + "balance_loss_clip": 1.02720571, + "balance_loss_mlp": 0.99973947, + "epoch": 0.5765646606144411, + "flos": 62212687758720.0, + "grad_norm": 0.752540202647363, + "language_loss": 0.58022308, + "learning_rate": 1.6038046698417336e-06, + "loss": 0.59830832, + "num_input_tokens_seen": 103499920, + "step": 4795, + "time_per_iteration": 3.94370436668396 + }, + { + "auxiliary_loss_clip": 0.01164884, + "auxiliary_loss_mlp": 0.01024995, + "balance_loss_clip": 1.05216634, + "balance_loss_mlp": 1.01725233, + "epoch": 0.5766849035050803, + "flos": 25118760205440.0, + "grad_norm": 1.9003486803110985, + "language_loss": 0.68532211, + "learning_rate": 1.6030411657017919e-06, + "loss": 0.70722091, + "num_input_tokens_seen": 103519575, + "step": 4796, + "time_per_iteration": 2.5120344161987305 + }, + { + "auxiliary_loss_clip": 0.01156231, + "auxiliary_loss_mlp": 0.01025154, + "balance_loss_clip": 1.05059648, + "balance_loss_mlp": 1.0174408, + "epoch": 0.5768051463957193, + "flos": 15991093578240.0, + "grad_norm": 1.761590501920671, + "language_loss": 0.84264827, + "learning_rate": 1.6022777217801903e-06, + "loss": 0.86446214, + "num_input_tokens_seen": 103536530, + "step": 4797, + "time_per_iteration": 3.161177158355713 + }, + { + "auxiliary_loss_clip": 0.01136033, + "auxiliary_loss_mlp": 0.01023375, + "balance_loss_clip": 1.05145061, + "balance_loss_mlp": 1.0154984, + "epoch": 0.5769253892863584, + "flos": 22163635359360.0, + "grad_norm": 1.8800946414537691, + "language_loss": 0.73611188, + "learning_rate": 1.601514338192742e-06, + "loss": 0.75770599, + "num_input_tokens_seen": 103556460, + "step": 4798, + "time_per_iteration": 2.5353286266326904 + }, + { + "auxiliary_loss_clip": 0.01175039, + "auxiliary_loss_mlp": 0.01023674, + "balance_loss_clip": 1.05287385, + "balance_loss_mlp": 1.01635456, + "epoch": 0.5770456321769976, + "flos": 22856388036480.0, + "grad_norm": 2.114391634578596, + "language_loss": 0.71742213, + "learning_rate": 1.6007510150552514e-06, + "loss": 0.73940921, + "num_input_tokens_seen": 103574520, + "step": 4799, + "time_per_iteration": 2.4333620071411133 + }, + { + "auxiliary_loss_clip": 0.01168884, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.05176425, + "balance_loss_mlp": 1.01856422, + "epoch": 0.5771658750676366, + "flos": 46353672489600.0, + "grad_norm": 1.5141077532487481, + "language_loss": 0.62086558, + "learning_rate": 1.599987752483515e-06, + "loss": 0.64282703, + "num_input_tokens_seen": 103598965, + "step": 4800, + "time_per_iteration": 2.6748929023742676 + }, + { + "auxiliary_loss_clip": 0.01130554, + "auxiliary_loss_mlp": 0.01028828, + "balance_loss_clip": 1.04643154, + "balance_loss_mlp": 1.02079892, + "epoch": 0.5772861179582757, + "flos": 22159972172160.0, + "grad_norm": 1.6100844400104255, + "language_loss": 0.67872632, + "learning_rate": 1.5992245505933184e-06, + "loss": 0.70032012, + "num_input_tokens_seen": 103618665, + "step": 4801, + "time_per_iteration": 2.5230495929718018 + }, + { + "auxiliary_loss_clip": 0.01180662, + "auxiliary_loss_mlp": 0.01028828, + "balance_loss_clip": 1.05408692, + "balance_loss_mlp": 1.02134156, + "epoch": 0.5774063608489148, + "flos": 31248926916480.0, + "grad_norm": 2.002559568933912, + "language_loss": 0.7088384, + "learning_rate": 1.5984614095004388e-06, + "loss": 0.73093331, + "num_input_tokens_seen": 103639800, + "step": 4802, + "time_per_iteration": 2.4983036518096924 + }, + { + "auxiliary_loss_clip": 0.01158881, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.05158973, + "balance_loss_mlp": 1.02246118, + "epoch": 0.5775266037395539, + "flos": 22527123039360.0, + "grad_norm": 2.80413296871043, + "language_loss": 0.81024653, + "learning_rate": 1.5976983293206438e-06, + "loss": 0.83213758, + "num_input_tokens_seen": 103655605, + "step": 4803, + "time_per_iteration": 2.449223518371582 + }, + { + "auxiliary_loss_clip": 0.01145952, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.04780221, + "balance_loss_mlp": 1.02193069, + "epoch": 0.577646846630193, + "flos": 21068790860160.0, + "grad_norm": 1.8254404834279567, + "language_loss": 0.71511841, + "learning_rate": 1.5969353101696928e-06, + "loss": 0.73687339, + "num_input_tokens_seen": 103674045, + "step": 4804, + "time_per_iteration": 2.4780876636505127 + }, + { + "auxiliary_loss_clip": 0.01164843, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.05211675, + "balance_loss_mlp": 1.02312672, + "epoch": 0.5777670895208321, + "flos": 29714284293120.0, + "grad_norm": 1.5720162114038148, + "language_loss": 0.79401541, + "learning_rate": 1.5961723521633341e-06, + "loss": 0.81596649, + "num_input_tokens_seen": 103695285, + "step": 4805, + "time_per_iteration": 2.527066230773926 + }, + { + "auxiliary_loss_clip": 0.01145788, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.04825234, + "balance_loss_mlp": 1.02364397, + "epoch": 0.5778873324114712, + "flos": 19500428344320.0, + "grad_norm": 2.2742097060379605, + "language_loss": 0.90755296, + "learning_rate": 1.5954094554173097e-06, + "loss": 0.92932737, + "num_input_tokens_seen": 103713275, + "step": 4806, + "time_per_iteration": 2.4670984745025635 + }, + { + "auxiliary_loss_clip": 0.01156293, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.05265927, + "balance_loss_mlp": 1.01957774, + "epoch": 0.5780075753021102, + "flos": 14136846716160.0, + "grad_norm": 2.099187690766042, + "language_loss": 0.78771853, + "learning_rate": 1.5946466200473482e-06, + "loss": 0.80955154, + "num_input_tokens_seen": 103731185, + "step": 4807, + "time_per_iteration": 2.4643871784210205 + }, + { + "auxiliary_loss_clip": 0.011533, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.04941022, + "balance_loss_mlp": 1.02274859, + "epoch": 0.5781278181927494, + "flos": 15262178883840.0, + "grad_norm": 1.9045103062158144, + "language_loss": 0.83419228, + "learning_rate": 1.5938838461691723e-06, + "loss": 0.85603213, + "num_input_tokens_seen": 103748095, + "step": 4808, + "time_per_iteration": 2.4726104736328125 + }, + { + "auxiliary_loss_clip": 0.01181256, + "auxiliary_loss_mlp": 0.01027813, + "balance_loss_clip": 1.05584574, + "balance_loss_mlp": 1.02013552, + "epoch": 0.5782480610833884, + "flos": 16726831856640.0, + "grad_norm": 2.6893097111012914, + "language_loss": 0.82849038, + "learning_rate": 1.593121133898494e-06, + "loss": 0.85058105, + "num_input_tokens_seen": 103765300, + "step": 4809, + "time_per_iteration": 2.404103994369507 + }, + { + "auxiliary_loss_clip": 0.01170514, + "auxiliary_loss_mlp": 0.0102681, + "balance_loss_clip": 1.05332327, + "balance_loss_mlp": 1.01931202, + "epoch": 0.5783683039740275, + "flos": 25482140144640.0, + "grad_norm": 2.3154858738439033, + "language_loss": 0.79078865, + "learning_rate": 1.592358483351016e-06, + "loss": 0.8127619, + "num_input_tokens_seen": 103785475, + "step": 4810, + "time_per_iteration": 2.4880356788635254 + }, + { + "auxiliary_loss_clip": 0.01159595, + "auxiliary_loss_mlp": 0.01024556, + "balance_loss_clip": 1.05054951, + "balance_loss_mlp": 1.01746881, + "epoch": 0.5784885468646667, + "flos": 18405835240320.0, + "grad_norm": 1.9558081568064312, + "language_loss": 0.72437102, + "learning_rate": 1.5915958946424326e-06, + "loss": 0.74621248, + "num_input_tokens_seen": 103804160, + "step": 4811, + "time_per_iteration": 2.433809518814087 + }, + { + "auxiliary_loss_clip": 0.01138734, + "auxiliary_loss_mlp": 0.00763159, + "balance_loss_clip": 1.05050564, + "balance_loss_mlp": 1.00041556, + "epoch": 0.5786087897553057, + "flos": 46100717936640.0, + "grad_norm": 1.6131716401641405, + "language_loss": 0.74637413, + "learning_rate": 1.5908333678884271e-06, + "loss": 0.76539308, + "num_input_tokens_seen": 103830580, + "step": 4812, + "time_per_iteration": 2.761343002319336 + }, + { + "auxiliary_loss_clip": 0.01162204, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.05179417, + "balance_loss_mlp": 1.02013421, + "epoch": 0.5787290326459448, + "flos": 12385950261120.0, + "grad_norm": 2.60731347072189, + "language_loss": 0.73593378, + "learning_rate": 1.5900709032046743e-06, + "loss": 0.75783324, + "num_input_tokens_seen": 103848655, + "step": 4813, + "time_per_iteration": 2.429922580718994 + }, + { + "auxiliary_loss_clip": 0.01147975, + "auxiliary_loss_mlp": 0.01027722, + "balance_loss_clip": 1.0530237, + "balance_loss_mlp": 1.01994109, + "epoch": 0.5788492755365839, + "flos": 23290332243840.0, + "grad_norm": 2.2008917073525547, + "language_loss": 0.78439766, + "learning_rate": 1.5893085007068391e-06, + "loss": 0.80615461, + "num_input_tokens_seen": 103866215, + "step": 4814, + "time_per_iteration": 2.4926669597625732 + }, + { + "auxiliary_loss_clip": 0.01138574, + "auxiliary_loss_mlp": 0.01025892, + "balance_loss_clip": 1.04767525, + "balance_loss_mlp": 1.01736546, + "epoch": 0.578969518427223, + "flos": 24061047390720.0, + "grad_norm": 1.7970503846266523, + "language_loss": 0.70889968, + "learning_rate": 1.5885461605105786e-06, + "loss": 0.73054433, + "num_input_tokens_seen": 103887815, + "step": 4815, + "time_per_iteration": 2.5430924892425537 + }, + { + "auxiliary_loss_clip": 0.01154508, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.0535754, + "balance_loss_mlp": 1.02012205, + "epoch": 0.579089761317862, + "flos": 21871825269120.0, + "grad_norm": 1.792600951521911, + "language_loss": 0.76843095, + "learning_rate": 1.5877838827315375e-06, + "loss": 0.79026026, + "num_input_tokens_seen": 103906360, + "step": 4816, + "time_per_iteration": 3.320629358291626 + }, + { + "auxiliary_loss_clip": 0.01178697, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.05510771, + "balance_loss_mlp": 1.02273405, + "epoch": 0.5792100042085012, + "flos": 22929681738240.0, + "grad_norm": 1.6887431972782252, + "language_loss": 0.70308185, + "learning_rate": 1.587021667485355e-06, + "loss": 0.72517234, + "num_input_tokens_seen": 103925730, + "step": 4817, + "time_per_iteration": 2.450834035873413 + }, + { + "auxiliary_loss_clip": 0.01150962, + "auxiliary_loss_mlp": 0.01022438, + "balance_loss_clip": 1.04798841, + "balance_loss_mlp": 1.0148263, + "epoch": 0.5793302470991403, + "flos": 21470056669440.0, + "grad_norm": 1.7952666705475375, + "language_loss": 0.78592384, + "learning_rate": 1.5862595148876559e-06, + "loss": 0.80765784, + "num_input_tokens_seen": 103945835, + "step": 4818, + "time_per_iteration": 2.5076398849487305 + }, + { + "auxiliary_loss_clip": 0.01125559, + "auxiliary_loss_mlp": 0.01028747, + "balance_loss_clip": 1.0495286, + "balance_loss_mlp": 1.02084947, + "epoch": 0.5794504899897793, + "flos": 12711013367040.0, + "grad_norm": 2.2706713707006374, + "language_loss": 0.76332724, + "learning_rate": 1.58549742505406e-06, + "loss": 0.78487033, + "num_input_tokens_seen": 103960580, + "step": 4819, + "time_per_iteration": 3.3719420433044434 + }, + { + "auxiliary_loss_clip": 0.01178578, + "auxiliary_loss_mlp": 0.01026739, + "balance_loss_clip": 1.05333555, + "balance_loss_mlp": 1.01927006, + "epoch": 0.5795707328804185, + "flos": 14867054300160.0, + "grad_norm": 2.5759263894171474, + "language_loss": 0.75158703, + "learning_rate": 1.5847353981001747e-06, + "loss": 0.77364022, + "num_input_tokens_seen": 103977760, + "step": 4820, + "time_per_iteration": 2.443037271499634 + }, + { + "auxiliary_loss_clip": 0.0114201, + "auxiliary_loss_mlp": 0.01029647, + "balance_loss_clip": 1.04654431, + "balance_loss_mlp": 1.02160025, + "epoch": 0.5796909757710575, + "flos": 36430046432640.0, + "grad_norm": 1.7062196958755085, + "language_loss": 0.69907612, + "learning_rate": 1.5839734341415993e-06, + "loss": 0.72079271, + "num_input_tokens_seen": 103999960, + "step": 4821, + "time_per_iteration": 3.46303653717041 + }, + { + "auxiliary_loss_clip": 0.01158311, + "auxiliary_loss_mlp": 0.01026743, + "balance_loss_clip": 1.05479527, + "balance_loss_mlp": 1.019418, + "epoch": 0.5798112186616966, + "flos": 23039891642880.0, + "grad_norm": 7.62861136487302, + "language_loss": 0.76747185, + "learning_rate": 1.5832115332939238e-06, + "loss": 0.78932238, + "num_input_tokens_seen": 104018400, + "step": 4822, + "time_per_iteration": 2.4939842224121094 + }, + { + "auxiliary_loss_clip": 0.01165624, + "auxiliary_loss_mlp": 0.01031723, + "balance_loss_clip": 1.05331111, + "balance_loss_mlp": 1.0240159, + "epoch": 0.5799314615523358, + "flos": 16652604401280.0, + "grad_norm": 2.0397647987742, + "language_loss": 0.74556226, + "learning_rate": 1.5824496956727272e-06, + "loss": 0.76753575, + "num_input_tokens_seen": 104035605, + "step": 4823, + "time_per_iteration": 2.455172538757324 + }, + { + "auxiliary_loss_clip": 0.01150368, + "auxiliary_loss_mlp": 0.01025473, + "balance_loss_clip": 1.05048776, + "balance_loss_mlp": 1.01774251, + "epoch": 0.5800517044429748, + "flos": 20485673470080.0, + "grad_norm": 1.6543905942696686, + "language_loss": 0.73162234, + "learning_rate": 1.5816879213935797e-06, + "loss": 0.75338078, + "num_input_tokens_seen": 104054415, + "step": 4824, + "time_per_iteration": 3.2578377723693848 + }, + { + "auxiliary_loss_clip": 0.01159574, + "auxiliary_loss_mlp": 0.01029769, + "balance_loss_clip": 1.05142653, + "balance_loss_mlp": 1.0224911, + "epoch": 0.5801719473336139, + "flos": 31538258968320.0, + "grad_norm": 1.6112439783685095, + "language_loss": 0.79972327, + "learning_rate": 1.5809262105720416e-06, + "loss": 0.82161665, + "num_input_tokens_seen": 104075455, + "step": 4825, + "time_per_iteration": 2.543463945388794 + }, + { + "auxiliary_loss_clip": 0.01175541, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.05400872, + "balance_loss_mlp": 1.02417207, + "epoch": 0.580292190224253, + "flos": 20375966355840.0, + "grad_norm": 1.508661381725362, + "language_loss": 0.79546809, + "learning_rate": 1.5801645633236644e-06, + "loss": 0.81753707, + "num_input_tokens_seen": 104096440, + "step": 4826, + "time_per_iteration": 2.461289167404175 + }, + { + "auxiliary_loss_clip": 0.01142754, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.04756069, + "balance_loss_mlp": 1.02289271, + "epoch": 0.5804124331148921, + "flos": 26615373304320.0, + "grad_norm": 1.9444833460267235, + "language_loss": 0.77387327, + "learning_rate": 1.579402979763989e-06, + "loss": 0.79560876, + "num_input_tokens_seen": 104116775, + "step": 4827, + "time_per_iteration": 2.5271782875061035 + }, + { + "auxiliary_loss_clip": 0.01121406, + "auxiliary_loss_mlp": 0.01024271, + "balance_loss_clip": 1.05055141, + "balance_loss_mlp": 1.0168587, + "epoch": 0.5805326760055312, + "flos": 13478496289920.0, + "grad_norm": 2.1982238170008617, + "language_loss": 0.81359017, + "learning_rate": 1.578641460008548e-06, + "loss": 0.83504689, + "num_input_tokens_seen": 104134510, + "step": 4828, + "time_per_iteration": 2.5493481159210205 + }, + { + "auxiliary_loss_clip": 0.01162447, + "auxiliary_loss_mlp": 0.01026366, + "balance_loss_clip": 1.05247688, + "balance_loss_mlp": 1.01814032, + "epoch": 0.5806529188961702, + "flos": 12091374823680.0, + "grad_norm": 2.1794946307205882, + "language_loss": 0.67927456, + "learning_rate": 1.5778800041728613e-06, + "loss": 0.70116264, + "num_input_tokens_seen": 104150800, + "step": 4829, + "time_per_iteration": 2.423433303833008 + }, + { + "auxiliary_loss_clip": 0.01159302, + "auxiliary_loss_mlp": 0.01021144, + "balance_loss_clip": 1.05304146, + "balance_loss_mlp": 1.01371145, + "epoch": 0.5807731617868094, + "flos": 26214107495040.0, + "grad_norm": 1.4959524687369703, + "language_loss": 0.66090465, + "learning_rate": 1.577118612372443e-06, + "loss": 0.6827091, + "num_input_tokens_seen": 104172640, + "step": 4830, + "time_per_iteration": 2.5437004566192627 + }, + { + "auxiliary_loss_clip": 0.01142239, + "auxiliary_loss_mlp": 0.00762631, + "balance_loss_clip": 1.04601967, + "balance_loss_mlp": 1.0005002, + "epoch": 0.5808934046774484, + "flos": 37962139190400.0, + "grad_norm": 1.6426443878281551, + "language_loss": 0.70309186, + "learning_rate": 1.5763572847227943e-06, + "loss": 0.72214061, + "num_input_tokens_seen": 104193525, + "step": 4831, + "time_per_iteration": 2.665447950363159 + }, + { + "auxiliary_loss_clip": 0.01160254, + "auxiliary_loss_mlp": 0.01025853, + "balance_loss_clip": 1.0496273, + "balance_loss_mlp": 1.01862288, + "epoch": 0.5810136475680875, + "flos": 20485853038080.0, + "grad_norm": 1.9382805648751418, + "language_loss": 0.81237346, + "learning_rate": 1.5755960213394091e-06, + "loss": 0.8342346, + "num_input_tokens_seen": 104210625, + "step": 4832, + "time_per_iteration": 2.4658761024475098 + }, + { + "auxiliary_loss_clip": 0.01136401, + "auxiliary_loss_mlp": 0.01027521, + "balance_loss_clip": 1.04745483, + "balance_loss_mlp": 1.02035367, + "epoch": 0.5811338904587267, + "flos": 17530153574400.0, + "grad_norm": 1.7978367172209813, + "language_loss": 0.78827655, + "learning_rate": 1.5748348223377703e-06, + "loss": 0.80991578, + "num_input_tokens_seen": 104228180, + "step": 4833, + "time_per_iteration": 2.5076098442077637 + }, + { + "auxiliary_loss_clip": 0.01146458, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.05074251, + "balance_loss_mlp": 1.02035737, + "epoch": 0.5812541333493657, + "flos": 19458017360640.0, + "grad_norm": 1.6187735851716132, + "language_loss": 0.77801555, + "learning_rate": 1.5740736878333507e-06, + "loss": 0.79975647, + "num_input_tokens_seen": 104246020, + "step": 4834, + "time_per_iteration": 2.4869067668914795 + }, + { + "auxiliary_loss_clip": 0.01152377, + "auxiliary_loss_mlp": 0.01024147, + "balance_loss_clip": 1.05032754, + "balance_loss_mlp": 1.01639867, + "epoch": 0.5813743762400048, + "flos": 20594949621120.0, + "grad_norm": 2.767879829741655, + "language_loss": 0.77816725, + "learning_rate": 1.5733126179416143e-06, + "loss": 0.79993248, + "num_input_tokens_seen": 104260505, + "step": 4835, + "time_per_iteration": 2.5152225494384766 + }, + { + "auxiliary_loss_clip": 0.0116264, + "auxiliary_loss_mlp": 0.01025491, + "balance_loss_clip": 1.05145264, + "balance_loss_mlp": 1.01820779, + "epoch": 0.5814946191306439, + "flos": 33178227246720.0, + "grad_norm": 2.749408754252559, + "language_loss": 0.72580385, + "learning_rate": 1.5725516127780137e-06, + "loss": 0.74768519, + "num_input_tokens_seen": 104282640, + "step": 4836, + "time_per_iteration": 2.57684326171875 + }, + { + "auxiliary_loss_clip": 0.01168099, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.04977918, + "balance_loss_mlp": 1.0219171, + "epoch": 0.581614862021283, + "flos": 16143283503360.0, + "grad_norm": 2.068136660915475, + "language_loss": 0.88011634, + "learning_rate": 1.5717906724579943e-06, + "loss": 0.90209925, + "num_input_tokens_seen": 104299700, + "step": 4837, + "time_per_iteration": 2.4465553760528564 + }, + { + "auxiliary_loss_clip": 0.01144227, + "auxiliary_loss_mlp": 0.01024046, + "balance_loss_clip": 1.05093455, + "balance_loss_mlp": 1.01672626, + "epoch": 0.581735104911922, + "flos": 33802642298880.0, + "grad_norm": 2.022580939214022, + "language_loss": 0.68180704, + "learning_rate": 1.571029797096989e-06, + "loss": 0.70348978, + "num_input_tokens_seen": 104320805, + "step": 4838, + "time_per_iteration": 2.6637215614318848 + }, + { + "auxiliary_loss_clip": 0.0117482, + "auxiliary_loss_mlp": 0.01029864, + "balance_loss_clip": 1.05176306, + "balance_loss_mlp": 1.02255654, + "epoch": 0.5818553478025612, + "flos": 23331163029120.0, + "grad_norm": 1.7913687619389325, + "language_loss": 0.78718591, + "learning_rate": 1.570268986810423e-06, + "loss": 0.80923277, + "num_input_tokens_seen": 104340700, + "step": 4839, + "time_per_iteration": 2.4550280570983887 + }, + { + "auxiliary_loss_clip": 0.01145935, + "auxiliary_loss_mlp": 0.01026746, + "balance_loss_clip": 1.04870248, + "balance_loss_mlp": 1.01996565, + "epoch": 0.5819755906932003, + "flos": 20996143603200.0, + "grad_norm": 1.985929185702165, + "language_loss": 0.74562323, + "learning_rate": 1.5695082417137096e-06, + "loss": 0.76735002, + "num_input_tokens_seen": 104358575, + "step": 4840, + "time_per_iteration": 2.4990272521972656 + }, + { + "auxiliary_loss_clip": 0.01144232, + "auxiliary_loss_mlp": 0.01025733, + "balance_loss_clip": 1.04634285, + "balance_loss_mlp": 1.01869965, + "epoch": 0.5820958335838393, + "flos": 21431668008960.0, + "grad_norm": 1.720241073364023, + "language_loss": 0.75308704, + "learning_rate": 1.5687475619222539e-06, + "loss": 0.77478671, + "num_input_tokens_seen": 104378530, + "step": 4841, + "time_per_iteration": 2.547492504119873 + }, + { + "auxiliary_loss_clip": 0.01141072, + "auxiliary_loss_mlp": 0.01025842, + "balance_loss_clip": 1.04540944, + "balance_loss_mlp": 1.01801324, + "epoch": 0.5822160764744785, + "flos": 17967473660160.0, + "grad_norm": 2.3256871300788577, + "language_loss": 0.73508853, + "learning_rate": 1.5679869475514496e-06, + "loss": 0.75675774, + "num_input_tokens_seen": 104395465, + "step": 4842, + "time_per_iteration": 3.2962448596954346 + }, + { + "auxiliary_loss_clip": 0.0116497, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.05276275, + "balance_loss_mlp": 1.02196956, + "epoch": 0.5823363193651175, + "flos": 23033858158080.0, + "grad_norm": 2.1838748952135205, + "language_loss": 0.81499696, + "learning_rate": 1.567226398716682e-06, + "loss": 0.83694774, + "num_input_tokens_seen": 104415380, + "step": 4843, + "time_per_iteration": 2.5237386226654053 + }, + { + "auxiliary_loss_clip": 0.01154551, + "auxiliary_loss_mlp": 0.01023428, + "balance_loss_clip": 1.04970193, + "balance_loss_mlp": 1.01489604, + "epoch": 0.5824565622557566, + "flos": 32891840110080.0, + "grad_norm": 1.8471673957133083, + "language_loss": 0.6191833, + "learning_rate": 1.566465915533326e-06, + "loss": 0.64096314, + "num_input_tokens_seen": 104437410, + "step": 4844, + "time_per_iteration": 2.6162548065185547 + }, + { + "auxiliary_loss_clip": 0.01160571, + "auxiliary_loss_mlp": 0.01025233, + "balance_loss_clip": 1.0516479, + "balance_loss_mlp": 1.01764512, + "epoch": 0.5825768051463958, + "flos": 22229674513920.0, + "grad_norm": 1.9538527415128801, + "language_loss": 0.88267934, + "learning_rate": 1.5657054981167458e-06, + "loss": 0.90453744, + "num_input_tokens_seen": 104456305, + "step": 4845, + "time_per_iteration": 3.3563575744628906 + }, + { + "auxiliary_loss_clip": 0.01159165, + "auxiliary_loss_mlp": 0.0102485, + "balance_loss_clip": 1.0512867, + "balance_loss_mlp": 1.01805472, + "epoch": 0.5826970480370348, + "flos": 28001561016960.0, + "grad_norm": 2.000017001513208, + "language_loss": 0.67855966, + "learning_rate": 1.5649451465822965e-06, + "loss": 0.70039982, + "num_input_tokens_seen": 104477695, + "step": 4846, + "time_per_iteration": 2.5291800498962402 + }, + { + "auxiliary_loss_clip": 0.01119701, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.04843366, + "balance_loss_mlp": 1.02113056, + "epoch": 0.5828172909276739, + "flos": 17858053854720.0, + "grad_norm": 1.7766887218450944, + "language_loss": 0.83740079, + "learning_rate": 1.5641848610453218e-06, + "loss": 0.85888422, + "num_input_tokens_seen": 104496355, + "step": 4847, + "time_per_iteration": 2.5439999103546143 + }, + { + "auxiliary_loss_clip": 0.01159178, + "auxiliary_loss_mlp": 0.01025862, + "balance_loss_clip": 1.05213666, + "balance_loss_mlp": 1.01814342, + "epoch": 0.582937533818313, + "flos": 19865244827520.0, + "grad_norm": 2.107255172201198, + "language_loss": 0.86087084, + "learning_rate": 1.563424641621158e-06, + "loss": 0.8827213, + "num_input_tokens_seen": 104515535, + "step": 4848, + "time_per_iteration": 3.325834274291992 + }, + { + "auxiliary_loss_clip": 0.01152505, + "auxiliary_loss_mlp": 0.01025521, + "balance_loss_clip": 1.04987502, + "balance_loss_mlp": 1.01762342, + "epoch": 0.5830577767089521, + "flos": 26870734068480.0, + "grad_norm": 1.94068994315908, + "language_loss": 0.69884402, + "learning_rate": 1.5626644884251282e-06, + "loss": 0.72062427, + "num_input_tokens_seen": 104535055, + "step": 4849, + "time_per_iteration": 2.5377447605133057 + }, + { + "auxiliary_loss_clip": 0.01176486, + "auxiliary_loss_mlp": 0.01023693, + "balance_loss_clip": 1.05314493, + "balance_loss_mlp": 1.01675177, + "epoch": 0.5831780195995911, + "flos": 25298205575040.0, + "grad_norm": 1.5909733309391125, + "language_loss": 0.88058019, + "learning_rate": 1.5619044015725488e-06, + "loss": 0.90258199, + "num_input_tokens_seen": 104554745, + "step": 4850, + "time_per_iteration": 2.465125322341919 + }, + { + "auxiliary_loss_clip": 0.01184149, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.05670619, + "balance_loss_mlp": 1.02236032, + "epoch": 0.5832982624902303, + "flos": 14756988049920.0, + "grad_norm": 3.1330635107007696, + "language_loss": 0.86991805, + "learning_rate": 1.5611443811787224e-06, + "loss": 0.89206892, + "num_input_tokens_seen": 104568870, + "step": 4851, + "time_per_iteration": 3.156158924102783 + }, + { + "auxiliary_loss_clip": 0.01160862, + "auxiliary_loss_mlp": 0.01023479, + "balance_loss_clip": 1.05103397, + "balance_loss_mlp": 1.01597786, + "epoch": 0.5834185053808694, + "flos": 20444555376000.0, + "grad_norm": 2.2519021943582413, + "language_loss": 0.69193029, + "learning_rate": 1.560384427358945e-06, + "loss": 0.71377373, + "num_input_tokens_seen": 104588415, + "step": 4852, + "time_per_iteration": 2.4471395015716553 + }, + { + "auxiliary_loss_clip": 0.01140894, + "auxiliary_loss_mlp": 0.01028994, + "balance_loss_clip": 1.04445791, + "balance_loss_mlp": 1.02146542, + "epoch": 0.5835387482715084, + "flos": 27200394115200.0, + "grad_norm": 2.463433953067411, + "language_loss": 0.73123449, + "learning_rate": 1.5596245402284998e-06, + "loss": 0.75293338, + "num_input_tokens_seen": 104611940, + "step": 4853, + "time_per_iteration": 2.5585215091705322 + }, + { + "auxiliary_loss_clip": 0.01166485, + "auxiliary_loss_mlp": 0.01027264, + "balance_loss_clip": 1.05385637, + "balance_loss_mlp": 1.01940215, + "epoch": 0.5836589911621476, + "flos": 16654615562880.0, + "grad_norm": 1.6673030764860544, + "language_loss": 0.8178246, + "learning_rate": 1.5588647199026619e-06, + "loss": 0.83976197, + "num_input_tokens_seen": 104629675, + "step": 4854, + "time_per_iteration": 2.441371440887451 + }, + { + "auxiliary_loss_clip": 0.01183148, + "auxiliary_loss_mlp": 0.01024559, + "balance_loss_clip": 1.05733562, + "balance_loss_mlp": 1.01665592, + "epoch": 0.5837792340527866, + "flos": 20446817932800.0, + "grad_norm": 2.0831662708254886, + "language_loss": 0.87713569, + "learning_rate": 1.5581049664966956e-06, + "loss": 0.89921272, + "num_input_tokens_seen": 104647435, + "step": 4855, + "time_per_iteration": 2.4172589778900146 + }, + { + "auxiliary_loss_clip": 0.01032293, + "auxiliary_loss_mlp": 0.01003257, + "balance_loss_clip": 1.02152658, + "balance_loss_mlp": 1.00216007, + "epoch": 0.5838994769434257, + "flos": 65995480765440.0, + "grad_norm": 0.9961906891607405, + "language_loss": 0.65095389, + "learning_rate": 1.5573452801258545e-06, + "loss": 0.67130935, + "num_input_tokens_seen": 104694605, + "step": 4856, + "time_per_iteration": 2.949613332748413 + }, + { + "auxiliary_loss_clip": 0.01168817, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.05254018, + "balance_loss_mlp": 1.02656889, + "epoch": 0.5840197198340649, + "flos": 21470523546240.0, + "grad_norm": 2.360438211936364, + "language_loss": 0.63235027, + "learning_rate": 1.5565856609053824e-06, + "loss": 0.65438151, + "num_input_tokens_seen": 104713400, + "step": 4857, + "time_per_iteration": 2.4557480812072754 + }, + { + "auxiliary_loss_clip": 0.01177818, + "auxiliary_loss_mlp": 0.01025229, + "balance_loss_clip": 1.05423403, + "balance_loss_mlp": 1.01728415, + "epoch": 0.5841399627247039, + "flos": 19135144984320.0, + "grad_norm": 2.0485681925859223, + "language_loss": 0.80327988, + "learning_rate": 1.5558261089505127e-06, + "loss": 0.82531041, + "num_input_tokens_seen": 104732130, + "step": 4858, + "time_per_iteration": 2.3986682891845703 + }, + { + "auxiliary_loss_clip": 0.01164291, + "auxiliary_loss_mlp": 0.01026393, + "balance_loss_clip": 1.05311632, + "balance_loss_mlp": 1.01891589, + "epoch": 0.584260205615343, + "flos": 26425692558720.0, + "grad_norm": 2.6940012537396596, + "language_loss": 0.80158663, + "learning_rate": 1.5550666243764697e-06, + "loss": 0.82349348, + "num_input_tokens_seen": 104750290, + "step": 4859, + "time_per_iteration": 2.508812427520752 + }, + { + "auxiliary_loss_clip": 0.01162938, + "auxiliary_loss_mlp": 0.01029103, + "balance_loss_clip": 1.05108821, + "balance_loss_mlp": 1.02162242, + "epoch": 0.584380448505982, + "flos": 13881809174400.0, + "grad_norm": 2.038169763437529, + "language_loss": 0.77519751, + "learning_rate": 1.554307207298465e-06, + "loss": 0.79711789, + "num_input_tokens_seen": 104768550, + "step": 4860, + "time_per_iteration": 2.437427043914795 + }, + { + "auxiliary_loss_clip": 0.01182664, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.05650914, + "balance_loss_mlp": 1.02346015, + "epoch": 0.5845006913966212, + "flos": 21543709507200.0, + "grad_norm": 1.860142245788919, + "language_loss": 0.78734183, + "learning_rate": 1.553547857831704e-06, + "loss": 0.80948329, + "num_input_tokens_seen": 104785060, + "step": 4861, + "time_per_iteration": 2.440162181854248 + }, + { + "auxiliary_loss_clip": 0.0108054, + "auxiliary_loss_mlp": 0.01000403, + "balance_loss_clip": 1.02137733, + "balance_loss_mlp": 0.99944383, + "epoch": 0.5846209342872603, + "flos": 58375452712320.0, + "grad_norm": 0.8851175172939759, + "language_loss": 0.64188695, + "learning_rate": 1.5527885760913771e-06, + "loss": 0.66269636, + "num_input_tokens_seen": 104834950, + "step": 4862, + "time_per_iteration": 2.851132869720459 + }, + { + "auxiliary_loss_clip": 0.01147899, + "auxiliary_loss_mlp": 0.01026378, + "balance_loss_clip": 1.05136418, + "balance_loss_mlp": 1.01900458, + "epoch": 0.5847411771778993, + "flos": 18588045957120.0, + "grad_norm": 1.5871219602991582, + "language_loss": 0.76283348, + "learning_rate": 1.552029362192668e-06, + "loss": 0.78457618, + "num_input_tokens_seen": 104854210, + "step": 4863, + "time_per_iteration": 2.48380184173584 + }, + { + "auxiliary_loss_clip": 0.01129701, + "auxiliary_loss_mlp": 0.01031475, + "balance_loss_clip": 1.04677558, + "balance_loss_mlp": 1.02391434, + "epoch": 0.5848614200685385, + "flos": 24240780069120.0, + "grad_norm": 1.8966944328291402, + "language_loss": 0.72216862, + "learning_rate": 1.5512702162507478e-06, + "loss": 0.74378037, + "num_input_tokens_seen": 104874525, + "step": 4864, + "time_per_iteration": 2.5531554222106934 + }, + { + "auxiliary_loss_clip": 0.01059597, + "auxiliary_loss_mlp": 0.01001672, + "balance_loss_clip": 1.02088821, + "balance_loss_mlp": 1.00066471, + "epoch": 0.5849816629591775, + "flos": 71660245933440.0, + "grad_norm": 1.1317534729688938, + "language_loss": 0.55839682, + "learning_rate": 1.5505111383807792e-06, + "loss": 0.57900953, + "num_input_tokens_seen": 104937195, + "step": 4865, + "time_per_iteration": 3.136577606201172 + }, + { + "auxiliary_loss_clip": 0.01123058, + "auxiliary_loss_mlp": 0.01024534, + "balance_loss_clip": 1.0455687, + "balance_loss_mlp": 1.01750982, + "epoch": 0.5851019058498166, + "flos": 23802095266560.0, + "grad_norm": 1.7223347079823759, + "language_loss": 0.80646664, + "learning_rate": 1.5497521286979138e-06, + "loss": 0.82794255, + "num_input_tokens_seen": 104957435, + "step": 4866, + "time_per_iteration": 2.5836257934570312 + }, + { + "auxiliary_loss_clip": 0.01138844, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.04933167, + "balance_loss_mlp": 1.02107573, + "epoch": 0.5852221487404557, + "flos": 24388516707840.0, + "grad_norm": 1.9035069135276117, + "language_loss": 0.74253416, + "learning_rate": 1.5489931873172927e-06, + "loss": 0.76421505, + "num_input_tokens_seen": 104978755, + "step": 4867, + "time_per_iteration": 2.568427085876465 + }, + { + "auxiliary_loss_clip": 0.01087025, + "auxiliary_loss_mlp": 0.01028749, + "balance_loss_clip": 1.03933191, + "balance_loss_mlp": 1.02133465, + "epoch": 0.5853423916310948, + "flos": 27271425260160.0, + "grad_norm": 1.6779308275106, + "language_loss": 0.79110068, + "learning_rate": 1.5482343143540467e-06, + "loss": 0.81225848, + "num_input_tokens_seen": 105000020, + "step": 4868, + "time_per_iteration": 2.625805139541626 + }, + { + "auxiliary_loss_clip": 0.01135247, + "auxiliary_loss_mlp": 0.00761814, + "balance_loss_clip": 1.04805756, + "balance_loss_mlp": 1.00038552, + "epoch": 0.5854626345217339, + "flos": 11983786611840.0, + "grad_norm": 1.926357160625032, + "language_loss": 0.8255769, + "learning_rate": 1.547475509923295e-06, + "loss": 0.84454751, + "num_input_tokens_seen": 105017060, + "step": 4869, + "time_per_iteration": 3.3666861057281494 + }, + { + "auxiliary_loss_clip": 0.01040175, + "auxiliary_loss_mlp": 0.01002552, + "balance_loss_clip": 1.01994157, + "balance_loss_mlp": 1.0015744, + "epoch": 0.585582877412373, + "flos": 64342335173760.0, + "grad_norm": 0.7328504637338263, + "language_loss": 0.56075072, + "learning_rate": 1.5467167741401495e-06, + "loss": 0.58117795, + "num_input_tokens_seen": 105078540, + "step": 4870, + "time_per_iteration": 3.132795572280884 + }, + { + "auxiliary_loss_clip": 0.01143404, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.04508471, + "balance_loss_mlp": 1.02055144, + "epoch": 0.5857031203030121, + "flos": 17011926103680.0, + "grad_norm": 2.673943306379921, + "language_loss": 0.71054357, + "learning_rate": 1.5459581071197083e-06, + "loss": 0.73226088, + "num_input_tokens_seen": 105094200, + "step": 4871, + "time_per_iteration": 2.476003646850586 + }, + { + "auxiliary_loss_clip": 0.01168802, + "auxiliary_loss_mlp": 0.01022287, + "balance_loss_clip": 1.05585456, + "balance_loss_mlp": 1.01464605, + "epoch": 0.5858233631936511, + "flos": 20885682303360.0, + "grad_norm": 2.0639407047433993, + "language_loss": 0.83156538, + "learning_rate": 1.5451995089770624e-06, + "loss": 0.85347629, + "num_input_tokens_seen": 105113985, + "step": 4872, + "time_per_iteration": 3.325416088104248 + }, + { + "auxiliary_loss_clip": 0.01175918, + "auxiliary_loss_mlp": 0.01024246, + "balance_loss_clip": 1.05331874, + "balance_loss_mlp": 1.01719499, + "epoch": 0.5859436060842903, + "flos": 23191902000000.0, + "grad_norm": 9.177022466088836, + "language_loss": 0.7179935, + "learning_rate": 1.5444409798272885e-06, + "loss": 0.73999512, + "num_input_tokens_seen": 105138075, + "step": 4873, + "time_per_iteration": 2.550305128097534 + }, + { + "auxiliary_loss_clip": 0.01136295, + "auxiliary_loss_mlp": 0.01031032, + "balance_loss_clip": 1.04809999, + "balance_loss_mlp": 1.02355802, + "epoch": 0.5860638489749294, + "flos": 22492648961280.0, + "grad_norm": 2.7690078253871917, + "language_loss": 0.80556107, + "learning_rate": 1.543682519785456e-06, + "loss": 0.82723439, + "num_input_tokens_seen": 105156555, + "step": 4874, + "time_per_iteration": 2.590604066848755 + }, + { + "auxiliary_loss_clip": 0.01148519, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.05019605, + "balance_loss_mlp": 1.02231646, + "epoch": 0.5861840918655684, + "flos": 17566243764480.0, + "grad_norm": 2.4416744322458652, + "language_loss": 0.80483055, + "learning_rate": 1.5429241289666219e-06, + "loss": 0.82661021, + "num_input_tokens_seen": 105174055, + "step": 4875, + "time_per_iteration": 3.315791130065918 + }, + { + "auxiliary_loss_clip": 0.01140726, + "auxiliary_loss_mlp": 0.01026856, + "balance_loss_clip": 1.04901659, + "balance_loss_mlp": 1.01955175, + "epoch": 0.5863043347562076, + "flos": 25556152118400.0, + "grad_norm": 1.9229775416406538, + "language_loss": 0.69987988, + "learning_rate": 1.5421658074858342e-06, + "loss": 0.72155571, + "num_input_tokens_seen": 105192160, + "step": 4876, + "time_per_iteration": 2.535771131515503 + }, + { + "auxiliary_loss_clip": 0.01145201, + "auxiliary_loss_mlp": 0.01029946, + "balance_loss_clip": 1.05000031, + "balance_loss_mlp": 1.02225113, + "epoch": 0.5864245776468466, + "flos": 20667525050880.0, + "grad_norm": 2.7927631739534875, + "language_loss": 0.66187465, + "learning_rate": 1.5414075554581298e-06, + "loss": 0.68362617, + "num_input_tokens_seen": 105210205, + "step": 4877, + "time_per_iteration": 3.263946771621704 + }, + { + "auxiliary_loss_clip": 0.01178424, + "auxiliary_loss_mlp": 0.01027163, + "balance_loss_clip": 1.05255139, + "balance_loss_mlp": 1.01991558, + "epoch": 0.5865448205374857, + "flos": 28913907490560.0, + "grad_norm": 2.465698080662706, + "language_loss": 0.78675997, + "learning_rate": 1.5406493729985348e-06, + "loss": 0.80881584, + "num_input_tokens_seen": 105229400, + "step": 4878, + "time_per_iteration": 2.476013422012329 + }, + { + "auxiliary_loss_clip": 0.01124357, + "auxiliary_loss_mlp": 0.00762172, + "balance_loss_clip": 1.04953027, + "balance_loss_mlp": 1.00028741, + "epoch": 0.5866650634281249, + "flos": 25842575168640.0, + "grad_norm": 2.697156231673269, + "language_loss": 0.7248143, + "learning_rate": 1.5398912602220644e-06, + "loss": 0.74367952, + "num_input_tokens_seen": 105248675, + "step": 4879, + "time_per_iteration": 2.583902359008789 + }, + { + "auxiliary_loss_clip": 0.01133181, + "auxiliary_loss_mlp": 0.01027374, + "balance_loss_clip": 1.0483495, + "balance_loss_mlp": 1.01992297, + "epoch": 0.5867853063187639, + "flos": 17052325925760.0, + "grad_norm": 2.2094827278864617, + "language_loss": 0.78712022, + "learning_rate": 1.539133217243724e-06, + "loss": 0.80872571, + "num_input_tokens_seen": 105265695, + "step": 4880, + "time_per_iteration": 2.518782615661621 + }, + { + "auxiliary_loss_clip": 0.01139508, + "auxiliary_loss_mlp": 0.01027867, + "balance_loss_clip": 1.04864419, + "balance_loss_mlp": 1.01960599, + "epoch": 0.586905549209403, + "flos": 24645026707200.0, + "grad_norm": 2.0988796510168375, + "language_loss": 0.76015568, + "learning_rate": 1.5383752441785081e-06, + "loss": 0.78182948, + "num_input_tokens_seen": 105284920, + "step": 4881, + "time_per_iteration": 2.5902581214904785 + }, + { + "auxiliary_loss_clip": 0.01167349, + "auxiliary_loss_mlp": 0.01035072, + "balance_loss_clip": 1.05216777, + "balance_loss_mlp": 1.02748466, + "epoch": 0.5870257921000421, + "flos": 14720538723840.0, + "grad_norm": 2.12705509228889, + "language_loss": 0.85114139, + "learning_rate": 1.5376173411414003e-06, + "loss": 0.87316561, + "num_input_tokens_seen": 105302960, + "step": 4882, + "time_per_iteration": 2.4419870376586914 + }, + { + "auxiliary_loss_clip": 0.01148606, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.04713047, + "balance_loss_mlp": 1.02263618, + "epoch": 0.5871460349906812, + "flos": 23914998691200.0, + "grad_norm": 2.286116162266255, + "language_loss": 0.78641808, + "learning_rate": 1.5368595082473753e-06, + "loss": 0.80821216, + "num_input_tokens_seen": 105321260, + "step": 4883, + "time_per_iteration": 2.5248355865478516 + }, + { + "auxiliary_loss_clip": 0.01163735, + "auxiliary_loss_mlp": 0.01020461, + "balance_loss_clip": 1.0496695, + "balance_loss_mlp": 1.01328456, + "epoch": 0.5872662778813202, + "flos": 22164174063360.0, + "grad_norm": 1.6252387607282661, + "language_loss": 0.77787477, + "learning_rate": 1.5361017456113935e-06, + "loss": 0.79971671, + "num_input_tokens_seen": 105341610, + "step": 4884, + "time_per_iteration": 2.4605295658111572 + }, + { + "auxiliary_loss_clip": 0.01165495, + "auxiliary_loss_mlp": 0.01027055, + "balance_loss_clip": 1.05140829, + "balance_loss_mlp": 1.01914513, + "epoch": 0.5873865207719594, + "flos": 18441925430400.0, + "grad_norm": 1.9039322470754632, + "language_loss": 0.86321974, + "learning_rate": 1.5353440533484085e-06, + "loss": 0.88514519, + "num_input_tokens_seen": 105360465, + "step": 4885, + "time_per_iteration": 2.44521427154541 + }, + { + "auxiliary_loss_clip": 0.01151262, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.05050683, + "balance_loss_mlp": 1.02125978, + "epoch": 0.5875067636625985, + "flos": 54015321427200.0, + "grad_norm": 1.8363400797070706, + "language_loss": 0.65927649, + "learning_rate": 1.534586431573361e-06, + "loss": 0.6810807, + "num_input_tokens_seen": 105385405, + "step": 4886, + "time_per_iteration": 2.792919397354126 + }, + { + "auxiliary_loss_clip": 0.01106488, + "auxiliary_loss_mlp": 0.01025683, + "balance_loss_clip": 1.04269648, + "balance_loss_mlp": 1.01675391, + "epoch": 0.5876270065532375, + "flos": 27995707100160.0, + "grad_norm": 1.8618837842434977, + "language_loss": 0.78928101, + "learning_rate": 1.5338288804011817e-06, + "loss": 0.81060272, + "num_input_tokens_seen": 105404905, + "step": 4887, + "time_per_iteration": 2.654822826385498 + }, + { + "auxiliary_loss_clip": 0.0114405, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.04710913, + "balance_loss_mlp": 1.02238119, + "epoch": 0.5877472494438767, + "flos": 21361462876800.0, + "grad_norm": 1.862960841958464, + "language_loss": 0.71203065, + "learning_rate": 1.533071399946791e-06, + "loss": 0.73377466, + "num_input_tokens_seen": 105423650, + "step": 4888, + "time_per_iteration": 2.4955098628997803 + }, + { + "auxiliary_loss_clip": 0.01150916, + "auxiliary_loss_mlp": 0.01026224, + "balance_loss_clip": 1.04857135, + "balance_loss_mlp": 1.01899385, + "epoch": 0.5878674923345157, + "flos": 22383013674240.0, + "grad_norm": 2.0136479059696826, + "language_loss": 0.57435399, + "learning_rate": 1.5323139903250977e-06, + "loss": 0.59612542, + "num_input_tokens_seen": 105444255, + "step": 4889, + "time_per_iteration": 2.533618688583374 + }, + { + "auxiliary_loss_clip": 0.01151524, + "auxiliary_loss_mlp": 0.01024734, + "balance_loss_clip": 1.05260658, + "balance_loss_mlp": 1.01745296, + "epoch": 0.5879877352251548, + "flos": 21868664872320.0, + "grad_norm": 1.7949150722762295, + "language_loss": 0.77045381, + "learning_rate": 1.5315566516510002e-06, + "loss": 0.79221642, + "num_input_tokens_seen": 105462425, + "step": 4890, + "time_per_iteration": 2.495255708694458 + }, + { + "auxiliary_loss_clip": 0.01177807, + "auxiliary_loss_mlp": 0.01027324, + "balance_loss_clip": 1.05443501, + "balance_loss_mlp": 1.01940525, + "epoch": 0.5881079781157939, + "flos": 17493811989120.0, + "grad_norm": 1.7239018469422087, + "language_loss": 0.67731953, + "learning_rate": 1.5307993840393857e-06, + "loss": 0.69937086, + "num_input_tokens_seen": 105480505, + "step": 4891, + "time_per_iteration": 2.4426653385162354 + }, + { + "auxiliary_loss_clip": 0.0117616, + "auxiliary_loss_mlp": 0.0102398, + "balance_loss_clip": 1.05199265, + "balance_loss_mlp": 1.01665497, + "epoch": 0.588228221006433, + "flos": 22601853285120.0, + "grad_norm": 1.846636804355916, + "language_loss": 0.80427814, + "learning_rate": 1.530042187605132e-06, + "loss": 0.82627952, + "num_input_tokens_seen": 105499760, + "step": 4892, + "time_per_iteration": 2.4386227130889893 + }, + { + "auxiliary_loss_clip": 0.01162774, + "auxiliary_loss_mlp": 0.00761443, + "balance_loss_clip": 1.05201054, + "balance_loss_mlp": 1.00024509, + "epoch": 0.5883484638970721, + "flos": 26176939896960.0, + "grad_norm": 1.3633597318893786, + "language_loss": 0.84165776, + "learning_rate": 1.5292850624631044e-06, + "loss": 0.86089993, + "num_input_tokens_seen": 105521955, + "step": 4893, + "time_per_iteration": 2.5085768699645996 + }, + { + "auxiliary_loss_clip": 0.01160156, + "auxiliary_loss_mlp": 0.01028469, + "balance_loss_clip": 1.05295575, + "balance_loss_mlp": 1.02048826, + "epoch": 0.5884687067877111, + "flos": 30443737691520.0, + "grad_norm": 1.909389229039017, + "language_loss": 0.80411786, + "learning_rate": 1.5285280087281593e-06, + "loss": 0.82600415, + "num_input_tokens_seen": 105542685, + "step": 4894, + "time_per_iteration": 2.5591704845428467 + }, + { + "auxiliary_loss_clip": 0.01061128, + "auxiliary_loss_mlp": 0.01001673, + "balance_loss_clip": 1.02113819, + "balance_loss_mlp": 1.0007019, + "epoch": 0.5885889496783503, + "flos": 70507550580480.0, + "grad_norm": 0.6507384048749758, + "language_loss": 0.56575453, + "learning_rate": 1.5277710265151398e-06, + "loss": 0.58638263, + "num_input_tokens_seen": 105612165, + "step": 4895, + "time_per_iteration": 3.230508804321289 + }, + { + "auxiliary_loss_clip": 0.0116331, + "auxiliary_loss_mlp": 0.01023341, + "balance_loss_clip": 1.05184925, + "balance_loss_mlp": 1.01520443, + "epoch": 0.5887091925689893, + "flos": 19098767485440.0, + "grad_norm": 2.3995892345281353, + "language_loss": 0.77127254, + "learning_rate": 1.5270141159388803e-06, + "loss": 0.7931391, + "num_input_tokens_seen": 105629185, + "step": 4896, + "time_per_iteration": 3.265119791030884 + }, + { + "auxiliary_loss_clip": 0.01175512, + "auxiliary_loss_mlp": 0.010242, + "balance_loss_clip": 1.0510962, + "balance_loss_mlp": 1.01646328, + "epoch": 0.5888294354596284, + "flos": 23294282739840.0, + "grad_norm": 1.6262157937744788, + "language_loss": 0.80495906, + "learning_rate": 1.526257277114203e-06, + "loss": 0.82695621, + "num_input_tokens_seen": 105650260, + "step": 4897, + "time_per_iteration": 2.4233932495117188 + }, + { + "auxiliary_loss_clip": 0.01143025, + "auxiliary_loss_mlp": 0.01024012, + "balance_loss_clip": 1.04937696, + "balance_loss_mlp": 1.01666224, + "epoch": 0.5889496783502676, + "flos": 21981532383360.0, + "grad_norm": 1.8312968996154904, + "language_loss": 0.79475182, + "learning_rate": 1.5255005101559201e-06, + "loss": 0.81642222, + "num_input_tokens_seen": 105667870, + "step": 4898, + "time_per_iteration": 2.489795684814453 + }, + { + "auxiliary_loss_clip": 0.01166231, + "auxiliary_loss_mlp": 0.01026366, + "balance_loss_clip": 1.05195093, + "balance_loss_mlp": 1.01932025, + "epoch": 0.5890699212409066, + "flos": 21685233093120.0, + "grad_norm": 2.1078549330762324, + "language_loss": 0.76870197, + "learning_rate": 1.524743815178833e-06, + "loss": 0.7906279, + "num_input_tokens_seen": 105685830, + "step": 4899, + "time_per_iteration": 3.3152313232421875 + }, + { + "auxiliary_loss_clip": 0.0114926, + "auxiliary_loss_mlp": 0.01024229, + "balance_loss_clip": 1.04817283, + "balance_loss_mlp": 1.0168587, + "epoch": 0.5891901641315457, + "flos": 19464553635840.0, + "grad_norm": 3.166271870474221, + "language_loss": 0.8101716, + "learning_rate": 1.5239871922977315e-06, + "loss": 0.83190656, + "num_input_tokens_seen": 105705745, + "step": 4900, + "time_per_iteration": 2.5020203590393066 + }, + { + "auxiliary_loss_clip": 0.01145567, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.04646659, + "balance_loss_mlp": 1.02446473, + "epoch": 0.5893104070221848, + "flos": 19609884063360.0, + "grad_norm": 1.846445680777024, + "language_loss": 0.8989383, + "learning_rate": 1.523230641627394e-06, + "loss": 0.92071652, + "num_input_tokens_seen": 105724730, + "step": 4901, + "time_per_iteration": 3.3320086002349854 + }, + { + "auxiliary_loss_clip": 0.01122301, + "auxiliary_loss_mlp": 0.01026981, + "balance_loss_clip": 1.04358888, + "balance_loss_mlp": 1.01954246, + "epoch": 0.5894306499128239, + "flos": 29060063930880.0, + "grad_norm": 1.9612376460470438, + "language_loss": 0.72871953, + "learning_rate": 1.5224741632825888e-06, + "loss": 0.75021231, + "num_input_tokens_seen": 105744920, + "step": 4902, + "time_per_iteration": 2.615467071533203 + }, + { + "auxiliary_loss_clip": 0.01181289, + "auxiliary_loss_mlp": 0.01024252, + "balance_loss_clip": 1.05576873, + "balance_loss_mlp": 1.01619053, + "epoch": 0.589550892803463, + "flos": 42298890721920.0, + "grad_norm": 1.6752828781043458, + "language_loss": 0.69171691, + "learning_rate": 1.521717757378074e-06, + "loss": 0.7137723, + "num_input_tokens_seen": 105765465, + "step": 4903, + "time_per_iteration": 2.6308889389038086 + }, + { + "auxiliary_loss_clip": 0.01168553, + "auxiliary_loss_mlp": 0.01030049, + "balance_loss_clip": 1.05305803, + "balance_loss_mlp": 1.02199602, + "epoch": 0.5896711356941021, + "flos": 14137062197760.0, + "grad_norm": 1.8413498162275888, + "language_loss": 0.69518149, + "learning_rate": 1.5209614240285943e-06, + "loss": 0.7171675, + "num_input_tokens_seen": 105783120, + "step": 4904, + "time_per_iteration": 3.2270171642303467 + }, + { + "auxiliary_loss_clip": 0.01174072, + "auxiliary_loss_mlp": 0.00761931, + "balance_loss_clip": 1.04998147, + "balance_loss_mlp": 1.00037193, + "epoch": 0.5897913785847412, + "flos": 17201355454080.0, + "grad_norm": 2.593005582368182, + "language_loss": 0.85191512, + "learning_rate": 1.520205163348887e-06, + "loss": 0.87127519, + "num_input_tokens_seen": 105801055, + "step": 4905, + "time_per_iteration": 2.4040586948394775 + }, + { + "auxiliary_loss_clip": 0.01051347, + "auxiliary_loss_mlp": 0.010015, + "balance_loss_clip": 1.01999092, + "balance_loss_mlp": 1.0003916, + "epoch": 0.5899116214753802, + "flos": 48794164202880.0, + "grad_norm": 0.7262631845310534, + "language_loss": 0.57018745, + "learning_rate": 1.519448975453674e-06, + "loss": 0.590716, + "num_input_tokens_seen": 105856155, + "step": 4906, + "time_per_iteration": 2.957052707672119 + }, + { + "auxiliary_loss_clip": 0.01164717, + "auxiliary_loss_mlp": 0.00762303, + "balance_loss_clip": 1.05371881, + "balance_loss_mlp": 1.00042319, + "epoch": 0.5900318643660194, + "flos": 21103659987840.0, + "grad_norm": 2.197850812735842, + "language_loss": 0.75720716, + "learning_rate": 1.5186928604576696e-06, + "loss": 0.77647734, + "num_input_tokens_seen": 105873350, + "step": 4907, + "time_per_iteration": 2.457343101501465 + }, + { + "auxiliary_loss_clip": 0.01147913, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.04785371, + "balance_loss_mlp": 1.02092731, + "epoch": 0.5901521072566585, + "flos": 21178390233600.0, + "grad_norm": 2.07140024042001, + "language_loss": 0.76665926, + "learning_rate": 1.5179368184755752e-06, + "loss": 0.78842217, + "num_input_tokens_seen": 105891435, + "step": 4908, + "time_per_iteration": 2.48539400100708 + }, + { + "auxiliary_loss_clip": 0.01146949, + "auxiliary_loss_mlp": 0.0102247, + "balance_loss_clip": 1.05015516, + "balance_loss_mlp": 1.01515639, + "epoch": 0.5902723501472975, + "flos": 20225967160320.0, + "grad_norm": 1.5581074641132817, + "language_loss": 0.82744849, + "learning_rate": 1.5171808496220821e-06, + "loss": 0.84914261, + "num_input_tokens_seen": 105910190, + "step": 4909, + "time_per_iteration": 2.4843177795410156 + }, + { + "auxiliary_loss_clip": 0.01153602, + "auxiliary_loss_mlp": 0.01025007, + "balance_loss_clip": 1.0498457, + "balance_loss_mlp": 1.01782131, + "epoch": 0.5903925930379367, + "flos": 22964407211520.0, + "grad_norm": 1.711347472887412, + "language_loss": 0.81179714, + "learning_rate": 1.5164249540118708e-06, + "loss": 0.83358324, + "num_input_tokens_seen": 105929315, + "step": 4910, + "time_per_iteration": 2.5136327743530273 + }, + { + "auxiliary_loss_clip": 0.01108093, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.04377532, + "balance_loss_mlp": 1.0202291, + "epoch": 0.5905128359285757, + "flos": 23367720096000.0, + "grad_norm": 1.7194469707999254, + "language_loss": 0.83124453, + "learning_rate": 1.5156691317596093e-06, + "loss": 0.8526057, + "num_input_tokens_seen": 105950740, + "step": 4911, + "time_per_iteration": 2.5977957248687744 + }, + { + "auxiliary_loss_clip": 0.01166619, + "auxiliary_loss_mlp": 0.00762103, + "balance_loss_clip": 1.05225146, + "balance_loss_mlp": 1.00041032, + "epoch": 0.5906330788192148, + "flos": 28032335994240.0, + "grad_norm": 2.351751233723427, + "language_loss": 0.66862053, + "learning_rate": 1.5149133829799556e-06, + "loss": 0.6879077, + "num_input_tokens_seen": 105968735, + "step": 4912, + "time_per_iteration": 2.5219383239746094 + }, + { + "auxiliary_loss_clip": 0.01154809, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.04871333, + "balance_loss_mlp": 1.02611589, + "epoch": 0.590753321709854, + "flos": 18477943793280.0, + "grad_norm": 2.1764494683531366, + "language_loss": 0.8045311, + "learning_rate": 1.5141577077875556e-06, + "loss": 0.82642019, + "num_input_tokens_seen": 105986060, + "step": 4913, + "time_per_iteration": 2.460291624069214 + }, + { + "auxiliary_loss_clip": 0.01166466, + "auxiliary_loss_mlp": 0.01026729, + "balance_loss_clip": 1.05300474, + "balance_loss_mlp": 1.01912963, + "epoch": 0.590873564600493, + "flos": 16873706568960.0, + "grad_norm": 1.7998377767036837, + "language_loss": 0.72579432, + "learning_rate": 1.5134021062970451e-06, + "loss": 0.74772632, + "num_input_tokens_seen": 106004440, + "step": 4914, + "time_per_iteration": 2.4363203048706055 + }, + { + "auxiliary_loss_clip": 0.01130013, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.05197072, + "balance_loss_mlp": 1.02078462, + "epoch": 0.5909938074911321, + "flos": 13516166678400.0, + "grad_norm": 2.0512883741565373, + "language_loss": 0.80964404, + "learning_rate": 1.5126465786230483e-06, + "loss": 0.83122927, + "num_input_tokens_seen": 106021215, + "step": 4915, + "time_per_iteration": 2.478245258331299 + }, + { + "auxiliary_loss_clip": 0.01175081, + "auxiliary_loss_mlp": 0.01026926, + "balance_loss_clip": 1.05142224, + "balance_loss_mlp": 1.01912951, + "epoch": 0.5911140503817712, + "flos": 26024067613440.0, + "grad_norm": 1.7527029326285188, + "language_loss": 0.82141525, + "learning_rate": 1.5118911248801787e-06, + "loss": 0.84343535, + "num_input_tokens_seen": 106039225, + "step": 4916, + "time_per_iteration": 2.444098949432373 + }, + { + "auxiliary_loss_clip": 0.01159578, + "auxiliary_loss_mlp": 0.01027247, + "balance_loss_clip": 1.05018401, + "balance_loss_mlp": 1.02036572, + "epoch": 0.5912342932724103, + "flos": 23258731253760.0, + "grad_norm": 1.979301126389231, + "language_loss": 0.79502845, + "learning_rate": 1.5111357451830364e-06, + "loss": 0.81689668, + "num_input_tokens_seen": 106057920, + "step": 4917, + "time_per_iteration": 2.460336923599243 + }, + { + "auxiliary_loss_clip": 0.01162486, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.05022955, + "balance_loss_mlp": 1.01943111, + "epoch": 0.5913545361630493, + "flos": 19573039687680.0, + "grad_norm": 1.9666009456757982, + "language_loss": 0.71161103, + "learning_rate": 1.5103804396462131e-06, + "loss": 0.73350626, + "num_input_tokens_seen": 106077855, + "step": 4918, + "time_per_iteration": 2.4605956077575684 + }, + { + "auxiliary_loss_clip": 0.01165644, + "auxiliary_loss_mlp": 0.01030637, + "balance_loss_clip": 1.04921901, + "balance_loss_mlp": 1.02252448, + "epoch": 0.5914747790536885, + "flos": 26213532877440.0, + "grad_norm": 3.088505184687068, + "language_loss": 0.79835898, + "learning_rate": 1.5096252083842877e-06, + "loss": 0.8203218, + "num_input_tokens_seen": 106097065, + "step": 4919, + "time_per_iteration": 2.4927666187286377 + }, + { + "auxiliary_loss_clip": 0.01158832, + "auxiliary_loss_mlp": 0.01024323, + "balance_loss_clip": 1.04779673, + "balance_loss_mlp": 1.01669347, + "epoch": 0.5915950219443276, + "flos": 27417545786880.0, + "grad_norm": 1.6976556706417651, + "language_loss": 0.84898341, + "learning_rate": 1.5088700515118285e-06, + "loss": 0.87081504, + "num_input_tokens_seen": 106116385, + "step": 4920, + "time_per_iteration": 2.503058910369873 + }, + { + "auxiliary_loss_clip": 0.01130942, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.04954422, + "balance_loss_mlp": 1.02028191, + "epoch": 0.5917152648349666, + "flos": 21907879545600.0, + "grad_norm": 1.6698580532702783, + "language_loss": 0.66167057, + "learning_rate": 1.508114969143392e-06, + "loss": 0.68326449, + "num_input_tokens_seen": 106136370, + "step": 4921, + "time_per_iteration": 2.513958692550659 + }, + { + "auxiliary_loss_clip": 0.01149561, + "auxiliary_loss_mlp": 0.01029695, + "balance_loss_clip": 1.04787827, + "balance_loss_mlp": 1.0225482, + "epoch": 0.5918355077256057, + "flos": 28109185142400.0, + "grad_norm": 1.783833279866674, + "language_loss": 0.77339721, + "learning_rate": 1.5073599613935238e-06, + "loss": 0.7951898, + "num_input_tokens_seen": 106158490, + "step": 4922, + "time_per_iteration": 3.415992259979248 + }, + { + "auxiliary_loss_clip": 0.01149563, + "auxiliary_loss_mlp": 0.01027678, + "balance_loss_clip": 1.04928064, + "balance_loss_mlp": 1.01992917, + "epoch": 0.5919557506162448, + "flos": 28183807647360.0, + "grad_norm": 2.123727696147373, + "language_loss": 0.57717514, + "learning_rate": 1.5066050283767574e-06, + "loss": 0.59894753, + "num_input_tokens_seen": 106179170, + "step": 4923, + "time_per_iteration": 2.5488946437835693 + }, + { + "auxiliary_loss_clip": 0.0114406, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.04959166, + "balance_loss_mlp": 1.02095032, + "epoch": 0.5920759935068839, + "flos": 12094355652480.0, + "grad_norm": 1.9618520512369764, + "language_loss": 0.82455289, + "learning_rate": 1.505850170207616e-06, + "loss": 0.84627688, + "num_input_tokens_seen": 106196035, + "step": 4924, + "time_per_iteration": 2.5083770751953125 + }, + { + "auxiliary_loss_clip": 0.0114584, + "auxiliary_loss_mlp": 0.01026718, + "balance_loss_clip": 1.04682088, + "balance_loss_mlp": 1.01937485, + "epoch": 0.592196236397523, + "flos": 29424772673280.0, + "grad_norm": 2.189166603395096, + "language_loss": 0.7800976, + "learning_rate": 1.505095387000611e-06, + "loss": 0.80182314, + "num_input_tokens_seen": 106218335, + "step": 4925, + "time_per_iteration": 2.5520801544189453 + }, + { + "auxiliary_loss_clip": 0.01138334, + "auxiliary_loss_mlp": 0.01026449, + "balance_loss_clip": 1.0482161, + "balance_loss_mlp": 1.01918066, + "epoch": 0.5923164792881621, + "flos": 24384709866240.0, + "grad_norm": 2.065637656285934, + "language_loss": 0.74177647, + "learning_rate": 1.504340678870242e-06, + "loss": 0.76342428, + "num_input_tokens_seen": 106236550, + "step": 4926, + "time_per_iteration": 3.3669991493225098 + }, + { + "auxiliary_loss_clip": 0.01162038, + "auxiliary_loss_mlp": 0.0103202, + "balance_loss_clip": 1.05101717, + "balance_loss_mlp": 1.02484989, + "epoch": 0.5924367221788012, + "flos": 24024238928640.0, + "grad_norm": 1.9737458741265004, + "language_loss": 0.89623493, + "learning_rate": 1.5035860459309989e-06, + "loss": 0.91817546, + "num_input_tokens_seen": 106254265, + "step": 4927, + "time_per_iteration": 2.4698445796966553 + }, + { + "auxiliary_loss_clip": 0.01143996, + "auxiliary_loss_mlp": 0.01029548, + "balance_loss_clip": 1.04748595, + "balance_loss_mlp": 1.02116728, + "epoch": 0.5925569650694402, + "flos": 26870590414080.0, + "grad_norm": 1.8080600223539682, + "language_loss": 0.63587487, + "learning_rate": 1.5028314882973568e-06, + "loss": 0.6576103, + "num_input_tokens_seen": 106274670, + "step": 4928, + "time_per_iteration": 3.3737740516662598 + }, + { + "auxiliary_loss_clip": 0.01151784, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.05230355, + "balance_loss_mlp": 1.02272153, + "epoch": 0.5926772079600794, + "flos": 22302788647680.0, + "grad_norm": 2.645334717007061, + "language_loss": 0.84577751, + "learning_rate": 1.502077006083783e-06, + "loss": 0.86760241, + "num_input_tokens_seen": 106293330, + "step": 4929, + "time_per_iteration": 2.4817793369293213 + }, + { + "auxiliary_loss_clip": 0.01167471, + "auxiliary_loss_mlp": 0.00761328, + "balance_loss_clip": 1.05173385, + "balance_loss_mlp": 1.00029147, + "epoch": 0.5927974508507184, + "flos": 19865244827520.0, + "grad_norm": 1.7331667546283522, + "language_loss": 0.76869094, + "learning_rate": 1.5013225994047315e-06, + "loss": 0.78797901, + "num_input_tokens_seen": 106310960, + "step": 4930, + "time_per_iteration": 2.4470012187957764 + }, + { + "auxiliary_loss_clip": 0.01165454, + "auxiliary_loss_mlp": 0.00761324, + "balance_loss_clip": 1.05366147, + "balance_loss_mlp": 1.00030732, + "epoch": 0.5929176937413575, + "flos": 15776743167360.0, + "grad_norm": 1.5171864093309735, + "language_loss": 0.80508769, + "learning_rate": 1.5005682683746452e-06, + "loss": 0.82435542, + "num_input_tokens_seen": 106329475, + "step": 4931, + "time_per_iteration": 3.183997869491577 + }, + { + "auxiliary_loss_clip": 0.01163788, + "auxiliary_loss_mlp": 0.01027969, + "balance_loss_clip": 1.05344498, + "balance_loss_mlp": 1.02032518, + "epoch": 0.5930379366319967, + "flos": 17601472028160.0, + "grad_norm": 2.2819889038742365, + "language_loss": 0.72695696, + "learning_rate": 1.4998140131079553e-06, + "loss": 0.74887455, + "num_input_tokens_seen": 106345565, + "step": 4932, + "time_per_iteration": 2.418487310409546 + }, + { + "auxiliary_loss_clip": 0.01102791, + "auxiliary_loss_mlp": 0.00761458, + "balance_loss_clip": 1.04363, + "balance_loss_mlp": 1.00026798, + "epoch": 0.5931581795226357, + "flos": 17704283731200.0, + "grad_norm": 1.7946972982747897, + "language_loss": 0.73921323, + "learning_rate": 1.4990598337190821e-06, + "loss": 0.75785571, + "num_input_tokens_seen": 106361920, + "step": 4933, + "time_per_iteration": 2.531195640563965 + }, + { + "auxiliary_loss_clip": 0.01175481, + "auxiliary_loss_mlp": 0.00762408, + "balance_loss_clip": 1.05142677, + "balance_loss_mlp": 1.00027764, + "epoch": 0.5932784224132748, + "flos": 24280102483200.0, + "grad_norm": 1.8443745540758933, + "language_loss": 0.67920095, + "learning_rate": 1.4983057303224338e-06, + "loss": 0.69857979, + "num_input_tokens_seen": 106381735, + "step": 4934, + "time_per_iteration": 2.4451987743377686 + }, + { + "auxiliary_loss_clip": 0.01118944, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.04617429, + "balance_loss_mlp": 1.0221796, + "epoch": 0.5933986653039139, + "flos": 22926700909440.0, + "grad_norm": 1.6630106734573074, + "language_loss": 0.87887239, + "learning_rate": 1.4975517030324072e-06, + "loss": 0.90036064, + "num_input_tokens_seen": 106399745, + "step": 4935, + "time_per_iteration": 2.543041944503784 + }, + { + "auxiliary_loss_clip": 0.01073839, + "auxiliary_loss_mlp": 0.00752771, + "balance_loss_clip": 1.01507378, + "balance_loss_mlp": 0.9997347, + "epoch": 0.593518908194553, + "flos": 71121730256640.0, + "grad_norm": 0.7858956376765502, + "language_loss": 0.61843598, + "learning_rate": 1.4967977519633882e-06, + "loss": 0.63670206, + "num_input_tokens_seen": 106457205, + "step": 4936, + "time_per_iteration": 3.118154525756836 + }, + { + "auxiliary_loss_clip": 0.0113402, + "auxiliary_loss_mlp": 0.01024581, + "balance_loss_clip": 1.04871881, + "balance_loss_mlp": 1.0169394, + "epoch": 0.593639151085192, + "flos": 20448649526400.0, + "grad_norm": 1.9104499777932717, + "language_loss": 0.78172088, + "learning_rate": 1.4960438772297494e-06, + "loss": 0.80330694, + "num_input_tokens_seen": 106474250, + "step": 4937, + "time_per_iteration": 2.512204647064209 + }, + { + "auxiliary_loss_clip": 0.01149633, + "auxiliary_loss_mlp": 0.01027627, + "balance_loss_clip": 1.04809988, + "balance_loss_mlp": 1.02007508, + "epoch": 0.5937593939758312, + "flos": 30883428074880.0, + "grad_norm": 2.2284603403364525, + "language_loss": 0.73346782, + "learning_rate": 1.495290078945855e-06, + "loss": 0.75524038, + "num_input_tokens_seen": 106494015, + "step": 4938, + "time_per_iteration": 2.5780420303344727 + }, + { + "auxiliary_loss_clip": 0.01175281, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.05246115, + "balance_loss_mlp": 1.02158952, + "epoch": 0.5938796368664703, + "flos": 36898069668480.0, + "grad_norm": 2.3131837347137636, + "language_loss": 0.73908448, + "learning_rate": 1.4945363572260529e-06, + "loss": 0.76113027, + "num_input_tokens_seen": 106515010, + "step": 4939, + "time_per_iteration": 2.553023099899292 + }, + { + "auxiliary_loss_clip": 0.01162091, + "auxiliary_loss_mlp": 0.0102468, + "balance_loss_clip": 1.04995251, + "balance_loss_mlp": 1.01750398, + "epoch": 0.5939998797571093, + "flos": 23842926051840.0, + "grad_norm": 2.280127823288347, + "language_loss": 0.67816079, + "learning_rate": 1.4937827121846845e-06, + "loss": 0.70002848, + "num_input_tokens_seen": 106535265, + "step": 4940, + "time_per_iteration": 2.52632474899292 + }, + { + "auxiliary_loss_clip": 0.01133648, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.0529623, + "balance_loss_mlp": 1.02721667, + "epoch": 0.5941201226477485, + "flos": 25191407462400.0, + "grad_norm": 1.513079214174461, + "language_loss": 0.73617417, + "learning_rate": 1.4930291439360755e-06, + "loss": 0.75785756, + "num_input_tokens_seen": 106557830, + "step": 4941, + "time_per_iteration": 2.567063570022583 + }, + { + "auxiliary_loss_clip": 0.0116539, + "auxiliary_loss_mlp": 0.01029056, + "balance_loss_clip": 1.05236495, + "balance_loss_mlp": 1.02051449, + "epoch": 0.5942403655383875, + "flos": 22418996123520.0, + "grad_norm": 1.7173236475614415, + "language_loss": 0.79456139, + "learning_rate": 1.4922756525945427e-06, + "loss": 0.81650585, + "num_input_tokens_seen": 106577140, + "step": 4942, + "time_per_iteration": 2.447849988937378 + }, + { + "auxiliary_loss_clip": 0.01063626, + "auxiliary_loss_mlp": 0.01001372, + "balance_loss_clip": 1.01418185, + "balance_loss_mlp": 1.0002749, + "epoch": 0.5943606084290266, + "flos": 67629310796160.0, + "grad_norm": 1.176669132312266, + "language_loss": 0.59675837, + "learning_rate": 1.4915222382743894e-06, + "loss": 0.61740828, + "num_input_tokens_seen": 106635975, + "step": 4943, + "time_per_iteration": 3.1054978370666504 + }, + { + "auxiliary_loss_clip": 0.01165791, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.05288613, + "balance_loss_mlp": 1.01963186, + "epoch": 0.5944808513196658, + "flos": 18223157646720.0, + "grad_norm": 2.511971399840203, + "language_loss": 0.71873939, + "learning_rate": 1.4907689010899085e-06, + "loss": 0.74067211, + "num_input_tokens_seen": 106653555, + "step": 4944, + "time_per_iteration": 2.431511640548706 + }, + { + "auxiliary_loss_clip": 0.01148534, + "auxiliary_loss_mlp": 0.01024146, + "balance_loss_clip": 1.04858875, + "balance_loss_mlp": 1.01636744, + "epoch": 0.5946010942103048, + "flos": 24790824011520.0, + "grad_norm": 2.005001366906365, + "language_loss": 0.6286431, + "learning_rate": 1.4900156411553804e-06, + "loss": 0.65036988, + "num_input_tokens_seen": 106673385, + "step": 4945, + "time_per_iteration": 2.5283596515655518 + }, + { + "auxiliary_loss_clip": 0.01153577, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.05174875, + "balance_loss_mlp": 1.01916051, + "epoch": 0.5947213371009439, + "flos": 15231619388160.0, + "grad_norm": 1.8912554429473638, + "language_loss": 0.85471523, + "learning_rate": 1.4892624585850739e-06, + "loss": 0.87651777, + "num_input_tokens_seen": 106691740, + "step": 4946, + "time_per_iteration": 2.4547576904296875 + }, + { + "auxiliary_loss_clip": 0.01180647, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.05407166, + "balance_loss_mlp": 1.02195621, + "epoch": 0.594841579991583, + "flos": 25848069949440.0, + "grad_norm": 1.948354772301302, + "language_loss": 0.79370081, + "learning_rate": 1.4885093534932465e-06, + "loss": 0.81580639, + "num_input_tokens_seen": 106709705, + "step": 4947, + "time_per_iteration": 2.451784610748291 + }, + { + "auxiliary_loss_clip": 0.01149525, + "auxiliary_loss_mlp": 0.01027932, + "balance_loss_clip": 1.05171728, + "balance_loss_mlp": 1.01985598, + "epoch": 0.5949618228822221, + "flos": 23981109672960.0, + "grad_norm": 2.1113095811427676, + "language_loss": 0.71349955, + "learning_rate": 1.4877563259941433e-06, + "loss": 0.7352742, + "num_input_tokens_seen": 106727560, + "step": 4948, + "time_per_iteration": 2.5894393920898438 + }, + { + "auxiliary_loss_clip": 0.01169289, + "auxiliary_loss_mlp": 0.0102833, + "balance_loss_clip": 1.05242014, + "balance_loss_mlp": 1.02004457, + "epoch": 0.5950820657728612, + "flos": 40547491476480.0, + "grad_norm": 1.9119158894215622, + "language_loss": 0.67768896, + "learning_rate": 1.4870033762019988e-06, + "loss": 0.69966519, + "num_input_tokens_seen": 106747725, + "step": 4949, + "time_per_iteration": 3.4546549320220947 + }, + { + "auxiliary_loss_clip": 0.01148257, + "auxiliary_loss_mlp": 0.01028029, + "balance_loss_clip": 1.05020618, + "balance_loss_mlp": 1.01979768, + "epoch": 0.5952023086635003, + "flos": 23184467884800.0, + "grad_norm": 1.8718595646818428, + "language_loss": 0.73295224, + "learning_rate": 1.4862505042310334e-06, + "loss": 0.75471509, + "num_input_tokens_seen": 106767010, + "step": 4950, + "time_per_iteration": 2.4868078231811523 + }, + { + "auxiliary_loss_clip": 0.01140951, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.04924345, + "balance_loss_mlp": 1.02516675, + "epoch": 0.5953225515541394, + "flos": 33653289548160.0, + "grad_norm": 2.7747454963618057, + "language_loss": 0.69862986, + "learning_rate": 1.4854977101954587e-06, + "loss": 0.72036242, + "num_input_tokens_seen": 106789230, + "step": 4951, + "time_per_iteration": 2.602067470550537 + }, + { + "auxiliary_loss_clip": 0.01163366, + "auxiliary_loss_mlp": 0.01028298, + "balance_loss_clip": 1.04802966, + "balance_loss_mlp": 1.02034664, + "epoch": 0.5954427944447784, + "flos": 24459619680000.0, + "grad_norm": 1.8566278373746716, + "language_loss": 0.86410409, + "learning_rate": 1.4847449942094716e-06, + "loss": 0.88602066, + "num_input_tokens_seen": 106808110, + "step": 4952, + "time_per_iteration": 2.5694382190704346 + }, + { + "auxiliary_loss_clip": 0.01145011, + "auxiliary_loss_mlp": 0.01026784, + "balance_loss_clip": 1.0497334, + "balance_loss_mlp": 1.01909447, + "epoch": 0.5955630373354175, + "flos": 18551848026240.0, + "grad_norm": 2.1879984913400503, + "language_loss": 0.8626225, + "learning_rate": 1.4839923563872598e-06, + "loss": 0.88434047, + "num_input_tokens_seen": 106826650, + "step": 4953, + "time_per_iteration": 3.299710750579834 + }, + { + "auxiliary_loss_clip": 0.01138145, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.05129373, + "balance_loss_mlp": 1.01990759, + "epoch": 0.5956832802260567, + "flos": 19791699730560.0, + "grad_norm": 2.0111740021682514, + "language_loss": 0.76011419, + "learning_rate": 1.483239796842997e-06, + "loss": 0.78177691, + "num_input_tokens_seen": 106844680, + "step": 4954, + "time_per_iteration": 2.4914398193359375 + }, + { + "auxiliary_loss_clip": 0.01135083, + "auxiliary_loss_mlp": 0.01028849, + "balance_loss_clip": 1.04863405, + "balance_loss_mlp": 1.02138078, + "epoch": 0.5958035231166957, + "flos": 19750868945280.0, + "grad_norm": 2.391598636700139, + "language_loss": 0.83933699, + "learning_rate": 1.4824873156908462e-06, + "loss": 0.86097634, + "num_input_tokens_seen": 106862605, + "step": 4955, + "time_per_iteration": 3.3377203941345215 + }, + { + "auxiliary_loss_clip": 0.01163082, + "auxiliary_loss_mlp": 0.00763065, + "balance_loss_clip": 1.0517801, + "balance_loss_mlp": 1.0003221, + "epoch": 0.5959237660073348, + "flos": 21652806090240.0, + "grad_norm": 1.6410390504623167, + "language_loss": 0.7558533, + "learning_rate": 1.4817349130449584e-06, + "loss": 0.77511477, + "num_input_tokens_seen": 106882325, + "step": 4956, + "time_per_iteration": 2.455779552459717 + }, + { + "auxiliary_loss_clip": 0.01160479, + "auxiliary_loss_mlp": 0.01024626, + "balance_loss_clip": 1.05075622, + "balance_loss_mlp": 1.01708627, + "epoch": 0.5960440088979739, + "flos": 21171207513600.0, + "grad_norm": 1.7988731833587812, + "language_loss": 0.83128268, + "learning_rate": 1.4809825890194717e-06, + "loss": 0.8531338, + "num_input_tokens_seen": 106900995, + "step": 4957, + "time_per_iteration": 3.168745756149292 + }, + { + "auxiliary_loss_clip": 0.01143143, + "auxiliary_loss_mlp": 0.01023334, + "balance_loss_clip": 1.0467366, + "balance_loss_mlp": 1.01525807, + "epoch": 0.596164251788613, + "flos": 14757526753920.0, + "grad_norm": 1.8143343458085588, + "language_loss": 0.77339411, + "learning_rate": 1.4802303437285139e-06, + "loss": 0.79505885, + "num_input_tokens_seen": 106918265, + "step": 4958, + "time_per_iteration": 2.451044797897339 + }, + { + "auxiliary_loss_clip": 0.01145635, + "auxiliary_loss_mlp": 0.01030751, + "balance_loss_clip": 1.0466224, + "balance_loss_mlp": 1.02278733, + "epoch": 0.596284494679252, + "flos": 20485924865280.0, + "grad_norm": 2.2989540994973168, + "language_loss": 0.8101123, + "learning_rate": 1.4794781772861994e-06, + "loss": 0.83187616, + "num_input_tokens_seen": 106934760, + "step": 4959, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01146987, + "auxiliary_loss_mlp": 0.00762551, + "balance_loss_clip": 1.04849386, + "balance_loss_mlp": 1.00034475, + "epoch": 0.5964047375698912, + "flos": 31212262108800.0, + "grad_norm": 2.0492302027120926, + "language_loss": 0.66953945, + "learning_rate": 1.4787260898066324e-06, + "loss": 0.68863487, + "num_input_tokens_seen": 106954760, + "step": 4960, + "time_per_iteration": 2.557249069213867 + }, + { + "auxiliary_loss_clip": 0.01176359, + "auxiliary_loss_mlp": 0.0102893, + "balance_loss_clip": 1.05356097, + "balance_loss_mlp": 1.02102649, + "epoch": 0.5965249804605303, + "flos": 27483620855040.0, + "grad_norm": 1.9676453578294528, + "language_loss": 0.85881627, + "learning_rate": 1.4779740814039023e-06, + "loss": 0.88086915, + "num_input_tokens_seen": 106974845, + "step": 4961, + "time_per_iteration": 2.4634146690368652 + }, + { + "auxiliary_loss_clip": 0.01175868, + "auxiliary_loss_mlp": 0.0102955, + "balance_loss_clip": 1.05139518, + "balance_loss_mlp": 1.02110434, + "epoch": 0.5966452233511693, + "flos": 30773936442240.0, + "grad_norm": 2.139889861102823, + "language_loss": 0.68651414, + "learning_rate": 1.4772221521920894e-06, + "loss": 0.70856833, + "num_input_tokens_seen": 106994870, + "step": 4962, + "time_per_iteration": 2.473191976547241 + }, + { + "auxiliary_loss_clip": 0.01152617, + "auxiliary_loss_mlp": 0.01025369, + "balance_loss_clip": 1.05465364, + "balance_loss_mlp": 1.01762605, + "epoch": 0.5967654662418085, + "flos": 25481170477440.0, + "grad_norm": 1.9149100296852453, + "language_loss": 0.74028611, + "learning_rate": 1.4764703022852598e-06, + "loss": 0.76206601, + "num_input_tokens_seen": 107015390, + "step": 4963, + "time_per_iteration": 2.5281455516815186 + }, + { + "auxiliary_loss_clip": 0.01096019, + "auxiliary_loss_mlp": 0.01023436, + "balance_loss_clip": 1.04384995, + "balance_loss_mlp": 1.01601553, + "epoch": 0.5968857091324475, + "flos": 19099126621440.0, + "grad_norm": 1.839949121094876, + "language_loss": 0.77111888, + "learning_rate": 1.4757185317974696e-06, + "loss": 0.79231346, + "num_input_tokens_seen": 107033775, + "step": 4964, + "time_per_iteration": 2.5622169971466064 + }, + { + "auxiliary_loss_clip": 0.01164558, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.05113244, + "balance_loss_mlp": 1.02014971, + "epoch": 0.5970059520230866, + "flos": 23692711374720.0, + "grad_norm": 2.331835218485058, + "language_loss": 0.70733988, + "learning_rate": 1.474966840842761e-06, + "loss": 0.72926784, + "num_input_tokens_seen": 107053355, + "step": 4965, + "time_per_iteration": 2.462054491043091 + }, + { + "auxiliary_loss_clip": 0.01167546, + "auxiliary_loss_mlp": 0.01027751, + "balance_loss_clip": 1.0524137, + "balance_loss_mlp": 1.01996338, + "epoch": 0.5971261949137258, + "flos": 23185545292800.0, + "grad_norm": 1.7218439076553258, + "language_loss": 0.86805224, + "learning_rate": 1.4742152295351655e-06, + "loss": 0.89000523, + "num_input_tokens_seen": 107072510, + "step": 4966, + "time_per_iteration": 2.4483320713043213 + }, + { + "auxiliary_loss_clip": 0.01163634, + "auxiliary_loss_mlp": 0.00762949, + "balance_loss_clip": 1.05017376, + "balance_loss_mlp": 1.00041175, + "epoch": 0.5972464378043648, + "flos": 20557710195840.0, + "grad_norm": 2.4383486074991283, + "language_loss": 0.64000165, + "learning_rate": 1.4734636979887016e-06, + "loss": 0.65926743, + "num_input_tokens_seen": 107089970, + "step": 4967, + "time_per_iteration": 2.4462952613830566 + }, + { + "auxiliary_loss_clip": 0.01138416, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.04806232, + "balance_loss_mlp": 1.02404666, + "epoch": 0.5973666806950039, + "flos": 29387030457600.0, + "grad_norm": 2.0295601572211126, + "language_loss": 0.90170622, + "learning_rate": 1.4727122463173755e-06, + "loss": 0.92341173, + "num_input_tokens_seen": 107108500, + "step": 4968, + "time_per_iteration": 2.5615103244781494 + }, + { + "auxiliary_loss_clip": 0.0114958, + "auxiliary_loss_mlp": 0.01025528, + "balance_loss_clip": 1.05158031, + "balance_loss_mlp": 1.01778495, + "epoch": 0.597486923585643, + "flos": 22273522041600.0, + "grad_norm": 1.7394379454645812, + "language_loss": 0.64354867, + "learning_rate": 1.471960874635183e-06, + "loss": 0.66529977, + "num_input_tokens_seen": 107128060, + "step": 4969, + "time_per_iteration": 2.4700706005096436 + }, + { + "auxiliary_loss_clip": 0.01145599, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.04819679, + "balance_loss_mlp": 1.02221036, + "epoch": 0.5976071664762821, + "flos": 13772461196160.0, + "grad_norm": 2.187790786830356, + "language_loss": 0.70690691, + "learning_rate": 1.4712095830561055e-06, + "loss": 0.72866654, + "num_input_tokens_seen": 107146550, + "step": 4970, + "time_per_iteration": 2.4548113346099854 + }, + { + "auxiliary_loss_clip": 0.01147186, + "auxiliary_loss_mlp": 0.01028012, + "balance_loss_clip": 1.0471102, + "balance_loss_mlp": 1.02043581, + "epoch": 0.5977274093669211, + "flos": 19098623831040.0, + "grad_norm": 1.8106387547572447, + "language_loss": 0.80656016, + "learning_rate": 1.4704583716941147e-06, + "loss": 0.8283121, + "num_input_tokens_seen": 107165415, + "step": 4971, + "time_per_iteration": 2.4699997901916504 + }, + { + "auxiliary_loss_clip": 0.01156526, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.05182981, + "balance_loss_mlp": 1.02576685, + "epoch": 0.5978476522575603, + "flos": 20376002269440.0, + "grad_norm": 2.1702936540877924, + "language_loss": 0.72501111, + "learning_rate": 1.4697072406631672e-06, + "loss": 0.74691004, + "num_input_tokens_seen": 107185320, + "step": 4972, + "time_per_iteration": 2.4371893405914307 + }, + { + "auxiliary_loss_clip": 0.01125817, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.05018544, + "balance_loss_mlp": 1.02584517, + "epoch": 0.5979678951481994, + "flos": 29023147728000.0, + "grad_norm": 1.588791497033447, + "language_loss": 0.72839105, + "learning_rate": 1.4689561900772097e-06, + "loss": 0.74999613, + "num_input_tokens_seen": 107205380, + "step": 4973, + "time_per_iteration": 2.6014435291290283 + }, + { + "auxiliary_loss_clip": 0.01146675, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.04700828, + "balance_loss_mlp": 1.02213907, + "epoch": 0.5980881380388384, + "flos": 17967689141760.0, + "grad_norm": 2.0716523326837435, + "language_loss": 0.72641158, + "learning_rate": 1.4682052200501758e-06, + "loss": 0.74817407, + "num_input_tokens_seen": 107222585, + "step": 4974, + "time_per_iteration": 2.4566664695739746 + }, + { + "auxiliary_loss_clip": 0.01176278, + "auxiliary_loss_mlp": 0.01027127, + "balance_loss_clip": 1.05194044, + "balance_loss_mlp": 1.01902044, + "epoch": 0.5982083809294776, + "flos": 22962827013120.0, + "grad_norm": 1.8513017249803914, + "language_loss": 0.79629666, + "learning_rate": 1.4674543306959876e-06, + "loss": 0.81833076, + "num_input_tokens_seen": 107242055, + "step": 4975, + "time_per_iteration": 2.4262027740478516 + }, + { + "auxiliary_loss_clip": 0.01156405, + "auxiliary_loss_mlp": 0.01028812, + "balance_loss_clip": 1.05264425, + "balance_loss_mlp": 1.02031815, + "epoch": 0.5983286238201166, + "flos": 20991941712000.0, + "grad_norm": 2.3764702166331664, + "language_loss": 0.84969616, + "learning_rate": 1.4667035221285535e-06, + "loss": 0.87154835, + "num_input_tokens_seen": 107259695, + "step": 4976, + "time_per_iteration": 3.314951181411743 + }, + { + "auxiliary_loss_clip": 0.01161551, + "auxiliary_loss_mlp": 0.01027003, + "balance_loss_clip": 1.05235291, + "balance_loss_mlp": 1.0192008, + "epoch": 0.5984488667107557, + "flos": 28183448511360.0, + "grad_norm": 1.9596747127444862, + "language_loss": 0.74235994, + "learning_rate": 1.4659527944617715e-06, + "loss": 0.76424545, + "num_input_tokens_seen": 107279640, + "step": 4977, + "time_per_iteration": 2.49161958694458 + }, + { + "auxiliary_loss_clip": 0.01101519, + "auxiliary_loss_mlp": 0.01026815, + "balance_loss_clip": 1.04230893, + "balance_loss_mlp": 1.018929, + "epoch": 0.5985691096013949, + "flos": 16471794314880.0, + "grad_norm": 1.8085265572665044, + "language_loss": 0.75897062, + "learning_rate": 1.465202147809526e-06, + "loss": 0.78025389, + "num_input_tokens_seen": 107298135, + "step": 4978, + "time_per_iteration": 2.5493552684783936 + }, + { + "auxiliary_loss_clip": 0.01177732, + "auxiliary_loss_mlp": 0.01026295, + "balance_loss_clip": 1.05335546, + "balance_loss_mlp": 1.01910388, + "epoch": 0.5986893524920339, + "flos": 26719046933760.0, + "grad_norm": 2.0609474560671837, + "language_loss": 0.76020217, + "learning_rate": 1.4644515822856888e-06, + "loss": 0.78224242, + "num_input_tokens_seen": 107316570, + "step": 4979, + "time_per_iteration": 3.2906670570373535 + }, + { + "auxiliary_loss_clip": 0.01042761, + "auxiliary_loss_mlp": 0.01002341, + "balance_loss_clip": 1.01489282, + "balance_loss_mlp": 1.00126767, + "epoch": 0.598809595382673, + "flos": 61608061100160.0, + "grad_norm": 0.9209046664704985, + "language_loss": 0.5651983, + "learning_rate": 1.4637010980041215e-06, + "loss": 0.58564937, + "num_input_tokens_seen": 107378680, + "step": 4980, + "time_per_iteration": 3.1087265014648438 + }, + { + "auxiliary_loss_clip": 0.01178441, + "auxiliary_loss_mlp": 0.01029064, + "balance_loss_clip": 1.05274665, + "balance_loss_mlp": 1.02085042, + "epoch": 0.5989298382733121, + "flos": 11801719549440.0, + "grad_norm": 2.3279182921558594, + "language_loss": 0.89423263, + "learning_rate": 1.4629506950786707e-06, + "loss": 0.91630769, + "num_input_tokens_seen": 107394860, + "step": 4981, + "time_per_iteration": 3.2508251667022705 + }, + { + "auxiliary_loss_clip": 0.01073633, + "auxiliary_loss_mlp": 0.01000602, + "balance_loss_clip": 1.01515174, + "balance_loss_mlp": 0.99955332, + "epoch": 0.5990500811639512, + "flos": 60025800021120.0, + "grad_norm": 0.8080336682060275, + "language_loss": 0.56101525, + "learning_rate": 1.4622003736231733e-06, + "loss": 0.58175755, + "num_input_tokens_seen": 107453850, + "step": 4982, + "time_per_iteration": 3.0760984420776367 + }, + { + "auxiliary_loss_clip": 0.01163708, + "auxiliary_loss_mlp": 0.01027783, + "balance_loss_clip": 1.05302715, + "balance_loss_mlp": 1.01946187, + "epoch": 0.5991703240545903, + "flos": 18222726683520.0, + "grad_norm": 2.3769213074901163, + "language_loss": 0.804515, + "learning_rate": 1.461450133751451e-06, + "loss": 0.8264299, + "num_input_tokens_seen": 107471920, + "step": 4983, + "time_per_iteration": 2.456864833831787 + }, + { + "auxiliary_loss_clip": 0.01167607, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.05290306, + "balance_loss_mlp": 1.02141893, + "epoch": 0.5992905669452293, + "flos": 27709894581120.0, + "grad_norm": 1.8922090337392568, + "language_loss": 0.76022387, + "learning_rate": 1.4606999755773153e-06, + "loss": 0.78219581, + "num_input_tokens_seen": 107493125, + "step": 4984, + "time_per_iteration": 3.260646343231201 + }, + { + "auxiliary_loss_clip": 0.01176605, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.05327678, + "balance_loss_mlp": 1.02060187, + "epoch": 0.5994108098358685, + "flos": 20449008662400.0, + "grad_norm": 1.97051652426096, + "language_loss": 0.821702, + "learning_rate": 1.4599498992145643e-06, + "loss": 0.8437537, + "num_input_tokens_seen": 107513150, + "step": 4985, + "time_per_iteration": 2.4697866439819336 + }, + { + "auxiliary_loss_clip": 0.01155634, + "auxiliary_loss_mlp": 0.00762361, + "balance_loss_clip": 1.05148196, + "balance_loss_mlp": 1.00047624, + "epoch": 0.5995310527265075, + "flos": 22269966595200.0, + "grad_norm": 1.8964918959056367, + "language_loss": 0.70976949, + "learning_rate": 1.4591999047769846e-06, + "loss": 0.72894943, + "num_input_tokens_seen": 107532005, + "step": 4986, + "time_per_iteration": 2.499343156814575 + }, + { + "auxiliary_loss_clip": 0.01102257, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.04158103, + "balance_loss_mlp": 1.0239768, + "epoch": 0.5996512956171466, + "flos": 18916951818240.0, + "grad_norm": 1.6569863288346862, + "language_loss": 0.75477016, + "learning_rate": 1.4584499923783486e-06, + "loss": 0.77611834, + "num_input_tokens_seen": 107550585, + "step": 4987, + "time_per_iteration": 2.5597124099731445 + }, + { + "auxiliary_loss_clip": 0.01146626, + "auxiliary_loss_mlp": 0.01022737, + "balance_loss_clip": 1.04883158, + "balance_loss_mlp": 1.01527464, + "epoch": 0.5997715385077858, + "flos": 15370916330880.0, + "grad_norm": 1.7460331297603642, + "language_loss": 0.76018357, + "learning_rate": 1.457700162132419e-06, + "loss": 0.78187728, + "num_input_tokens_seen": 107567575, + "step": 4988, + "time_per_iteration": 2.455214262008667 + }, + { + "auxiliary_loss_clip": 0.01116324, + "auxiliary_loss_mlp": 0.01024681, + "balance_loss_clip": 1.04639053, + "balance_loss_mlp": 1.01710546, + "epoch": 0.5998917813984248, + "flos": 25264844818560.0, + "grad_norm": 2.5868413614548573, + "language_loss": 0.72516936, + "learning_rate": 1.4569504141529433e-06, + "loss": 0.74657941, + "num_input_tokens_seen": 107585410, + "step": 4989, + "time_per_iteration": 2.582700252532959 + }, + { + "auxiliary_loss_clip": 0.01165248, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.05479109, + "balance_loss_mlp": 1.02441216, + "epoch": 0.6000120242890639, + "flos": 22054502862720.0, + "grad_norm": 1.9744354764354737, + "language_loss": 0.72019613, + "learning_rate": 1.456200748553658e-06, + "loss": 0.74217606, + "num_input_tokens_seen": 107603405, + "step": 4990, + "time_per_iteration": 2.449049472808838 + }, + { + "auxiliary_loss_clip": 0.01179614, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.0533433, + "balance_loss_mlp": 1.02327228, + "epoch": 0.600132267179703, + "flos": 29863421562240.0, + "grad_norm": 1.774715116444949, + "language_loss": 0.78607428, + "learning_rate": 1.455451165448287e-06, + "loss": 0.80818862, + "num_input_tokens_seen": 107626060, + "step": 4991, + "time_per_iteration": 2.491271734237671 + }, + { + "auxiliary_loss_clip": 0.01146083, + "auxiliary_loss_mlp": 0.01027813, + "balance_loss_clip": 1.05040836, + "balance_loss_mlp": 1.01975465, + "epoch": 0.6002525100703421, + "flos": 25045358762880.0, + "grad_norm": 2.1315986861374623, + "language_loss": 0.73357773, + "learning_rate": 1.4547016649505407e-06, + "loss": 0.75531662, + "num_input_tokens_seen": 107644070, + "step": 4992, + "time_per_iteration": 2.5270144939422607 + }, + { + "auxiliary_loss_clip": 0.01133068, + "auxiliary_loss_mlp": 0.01023193, + "balance_loss_clip": 1.04659677, + "balance_loss_mlp": 1.01486278, + "epoch": 0.6003727529609811, + "flos": 20849592113280.0, + "grad_norm": 2.0925473028567425, + "language_loss": 0.84769738, + "learning_rate": 1.4539522471741193e-06, + "loss": 0.86925995, + "num_input_tokens_seen": 107661495, + "step": 4993, + "time_per_iteration": 2.575850248336792 + }, + { + "auxiliary_loss_clip": 0.01165259, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.04939103, + "balance_loss_mlp": 1.02135623, + "epoch": 0.6004929958516203, + "flos": 15594604277760.0, + "grad_norm": 2.0274479076688436, + "language_loss": 0.71188104, + "learning_rate": 1.4532029122327067e-06, + "loss": 0.73383111, + "num_input_tokens_seen": 107678280, + "step": 4994, + "time_per_iteration": 2.423466682434082 + }, + { + "auxiliary_loss_clip": 0.01132062, + "auxiliary_loss_mlp": 0.01029756, + "balance_loss_clip": 1.05140328, + "balance_loss_mlp": 1.02209139, + "epoch": 0.6006132387422594, + "flos": 21763267390080.0, + "grad_norm": 1.9483110586840606, + "language_loss": 0.75176615, + "learning_rate": 1.4524536602399783e-06, + "loss": 0.77338433, + "num_input_tokens_seen": 107697370, + "step": 4995, + "time_per_iteration": 2.5061392784118652 + }, + { + "auxiliary_loss_clip": 0.01143593, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.04965007, + "balance_loss_mlp": 1.02633905, + "epoch": 0.6007334816328984, + "flos": 22858542852480.0, + "grad_norm": 1.575291398430298, + "language_loss": 0.77695823, + "learning_rate": 1.4517044913095938e-06, + "loss": 0.79873389, + "num_input_tokens_seen": 107717790, + "step": 4996, + "time_per_iteration": 2.5041141510009766 + }, + { + "auxiliary_loss_clip": 0.01163615, + "auxiliary_loss_mlp": 0.01026292, + "balance_loss_clip": 1.05148876, + "balance_loss_mlp": 1.0180068, + "epoch": 0.6008537245235376, + "flos": 28324577047680.0, + "grad_norm": 2.175484317456789, + "language_loss": 0.81437993, + "learning_rate": 1.4509554055552022e-06, + "loss": 0.83627903, + "num_input_tokens_seen": 107738020, + "step": 4997, + "time_per_iteration": 2.5001513957977295 + }, + { + "auxiliary_loss_clip": 0.0114588, + "auxiliary_loss_mlp": 0.01029076, + "balance_loss_clip": 1.04908776, + "balance_loss_mlp": 1.02102661, + "epoch": 0.6009739674141766, + "flos": 20886113266560.0, + "grad_norm": 3.6005132846392973, + "language_loss": 0.83950353, + "learning_rate": 1.450206403090439e-06, + "loss": 0.86125308, + "num_input_tokens_seen": 107756215, + "step": 4998, + "time_per_iteration": 2.4637374877929688 + }, + { + "auxiliary_loss_clip": 0.01163042, + "auxiliary_loss_mlp": 0.01024263, + "balance_loss_clip": 1.05340576, + "balance_loss_mlp": 1.01642537, + "epoch": 0.6010942103048157, + "flos": 20481004702080.0, + "grad_norm": 2.753614681859002, + "language_loss": 0.86094439, + "learning_rate": 1.4494574840289274e-06, + "loss": 0.88281751, + "num_input_tokens_seen": 107773330, + "step": 4999, + "time_per_iteration": 2.4368371963500977 + }, + { + "auxiliary_loss_clip": 0.01166374, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.04971433, + "balance_loss_mlp": 1.02477598, + "epoch": 0.6012144531954549, + "flos": 23805973935360.0, + "grad_norm": 5.324664305094547, + "language_loss": 0.73814762, + "learning_rate": 1.4487086484842782e-06, + "loss": 0.76014602, + "num_input_tokens_seen": 107791975, + "step": 5000, + "time_per_iteration": 2.4726297855377197 + }, + { + "auxiliary_loss_clip": 0.01171895, + "auxiliary_loss_mlp": 0.0102602, + "balance_loss_clip": 1.04896939, + "balance_loss_mlp": 1.01844716, + "epoch": 0.6013346960860939, + "flos": 18988378012800.0, + "grad_norm": 2.252309271280477, + "language_loss": 0.60172224, + "learning_rate": 1.4479598965700878e-06, + "loss": 0.62370133, + "num_input_tokens_seen": 107809240, + "step": 5001, + "time_per_iteration": 2.3883323669433594 + }, + { + "auxiliary_loss_clip": 0.01132745, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.04567719, + "balance_loss_mlp": 1.01950622, + "epoch": 0.601454938976733, + "flos": 24025316336640.0, + "grad_norm": 2.272489443577085, + "language_loss": 0.68891734, + "learning_rate": 1.4472112283999427e-06, + "loss": 0.71051997, + "num_input_tokens_seen": 107827895, + "step": 5002, + "time_per_iteration": 3.3464436531066895 + }, + { + "auxiliary_loss_clip": 0.01160421, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.05323637, + "balance_loss_mlp": 1.0249815, + "epoch": 0.6015751818673721, + "flos": 26427129102720.0, + "grad_norm": 2.0363594406857852, + "language_loss": 0.6939925, + "learning_rate": 1.4464626440874143e-06, + "loss": 0.71592242, + "num_input_tokens_seen": 107847010, + "step": 5003, + "time_per_iteration": 2.494530439376831 + }, + { + "auxiliary_loss_clip": 0.01126397, + "auxiliary_loss_mlp": 0.01028353, + "balance_loss_clip": 1.04351139, + "balance_loss_mlp": 1.02006209, + "epoch": 0.6016954247580112, + "flos": 13115260005120.0, + "grad_norm": 2.402330950953752, + "language_loss": 0.74155611, + "learning_rate": 1.4457141437460636e-06, + "loss": 0.7631036, + "num_input_tokens_seen": 107864235, + "step": 5004, + "time_per_iteration": 2.50895619392395 + }, + { + "auxiliary_loss_clip": 0.01149278, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.04833245, + "balance_loss_mlp": 1.02110839, + "epoch": 0.6018156676486502, + "flos": 23768447201280.0, + "grad_norm": 1.7299193761026348, + "language_loss": 0.73381329, + "learning_rate": 1.444965727489436e-06, + "loss": 0.7556017, + "num_input_tokens_seen": 107883680, + "step": 5005, + "time_per_iteration": 3.347539186477661 + }, + { + "auxiliary_loss_clip": 0.01130165, + "auxiliary_loss_mlp": 0.01029062, + "balance_loss_clip": 1.04347682, + "balance_loss_mlp": 1.02115822, + "epoch": 0.6019359105392894, + "flos": 26469360518400.0, + "grad_norm": 9.282858128044536, + "language_loss": 0.63305753, + "learning_rate": 1.444217395431066e-06, + "loss": 0.65464979, + "num_input_tokens_seen": 107906220, + "step": 5006, + "time_per_iteration": 2.5798964500427246 + }, + { + "auxiliary_loss_clip": 0.01038194, + "auxiliary_loss_mlp": 0.01005458, + "balance_loss_clip": 1.01464319, + "balance_loss_mlp": 1.00442088, + "epoch": 0.6020561534299285, + "flos": 69190849728000.0, + "grad_norm": 0.8039692586969128, + "language_loss": 0.55863523, + "learning_rate": 1.4434691476844755e-06, + "loss": 0.57907176, + "num_input_tokens_seen": 107967195, + "step": 5007, + "time_per_iteration": 3.0605690479278564 + }, + { + "auxiliary_loss_clip": 0.01143008, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.04916382, + "balance_loss_mlp": 1.0197922, + "epoch": 0.6021763963205675, + "flos": 21835304115840.0, + "grad_norm": 2.123492861379607, + "language_loss": 0.6677469, + "learning_rate": 1.4427209843631729e-06, + "loss": 0.68944573, + "num_input_tokens_seen": 107984245, + "step": 5008, + "time_per_iteration": 3.344230890274048 + }, + { + "auxiliary_loss_clip": 0.01175014, + "auxiliary_loss_mlp": 0.00762186, + "balance_loss_clip": 1.05259132, + "balance_loss_mlp": 1.0005331, + "epoch": 0.6022966392112067, + "flos": 26578636669440.0, + "grad_norm": 1.83676117665767, + "language_loss": 0.81273806, + "learning_rate": 1.4419729055806534e-06, + "loss": 0.83211005, + "num_input_tokens_seen": 108003680, + "step": 5009, + "time_per_iteration": 2.489734172821045 + }, + { + "auxiliary_loss_clip": 0.01144772, + "auxiliary_loss_mlp": 0.00762106, + "balance_loss_clip": 1.05130374, + "balance_loss_mlp": 1.0005089, + "epoch": 0.6024168821018457, + "flos": 20703722981760.0, + "grad_norm": 2.1334655888156724, + "language_loss": 0.82075316, + "learning_rate": 1.441224911450401e-06, + "loss": 0.83982193, + "num_input_tokens_seen": 108019635, + "step": 5010, + "time_per_iteration": 3.2493977546691895 + }, + { + "auxiliary_loss_clip": 0.01166597, + "auxiliary_loss_mlp": 0.01035405, + "balance_loss_clip": 1.05022347, + "balance_loss_mlp": 1.02731037, + "epoch": 0.6025371249924848, + "flos": 24680973242880.0, + "grad_norm": 2.7722133494753143, + "language_loss": 0.82099724, + "learning_rate": 1.4404770020858851e-06, + "loss": 0.84301722, + "num_input_tokens_seen": 108039120, + "step": 5011, + "time_per_iteration": 2.47942852973938 + }, + { + "auxiliary_loss_clip": 0.0115398, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.0478797, + "balance_loss_mlp": 1.02033556, + "epoch": 0.602657367883124, + "flos": 25955801815680.0, + "grad_norm": 1.59993587240379, + "language_loss": 0.85915565, + "learning_rate": 1.439729177600563e-06, + "loss": 0.88097489, + "num_input_tokens_seen": 108059615, + "step": 5012, + "time_per_iteration": 2.4997735023498535 + }, + { + "auxiliary_loss_clip": 0.01137371, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.04735422, + "balance_loss_mlp": 1.01887453, + "epoch": 0.602777610773763, + "flos": 16690633925760.0, + "grad_norm": 2.0473062333553855, + "language_loss": 0.73011851, + "learning_rate": 1.4389814381078793e-06, + "loss": 0.75176024, + "num_input_tokens_seen": 108078855, + "step": 5013, + "time_per_iteration": 2.539184331893921 + }, + { + "auxiliary_loss_clip": 0.01087009, + "auxiliary_loss_mlp": 0.01034419, + "balance_loss_clip": 1.0457232, + "balance_loss_mlp": 1.02707553, + "epoch": 0.6028978536644021, + "flos": 13334243270400.0, + "grad_norm": 3.7071607608245536, + "language_loss": 0.8010776, + "learning_rate": 1.438233783721265e-06, + "loss": 0.82229185, + "num_input_tokens_seen": 108095020, + "step": 5014, + "time_per_iteration": 2.580188035964966 + }, + { + "auxiliary_loss_clip": 0.01148489, + "auxiliary_loss_mlp": 0.01026996, + "balance_loss_clip": 1.05526173, + "balance_loss_mlp": 1.01934278, + "epoch": 0.6030180965550412, + "flos": 19644825018240.0, + "grad_norm": 1.9904394804667211, + "language_loss": 0.77557641, + "learning_rate": 1.43748621455414e-06, + "loss": 0.79733133, + "num_input_tokens_seen": 108111455, + "step": 5015, + "time_per_iteration": 2.4913110733032227 + }, + { + "auxiliary_loss_clip": 0.01144282, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.04798412, + "balance_loss_mlp": 1.02138174, + "epoch": 0.6031383394456803, + "flos": 14458390289280.0, + "grad_norm": 2.5235556294668817, + "language_loss": 0.80881631, + "learning_rate": 1.4367387307199082e-06, + "loss": 0.83055246, + "num_input_tokens_seen": 108128305, + "step": 5016, + "time_per_iteration": 2.4549930095672607 + }, + { + "auxiliary_loss_clip": 0.01156434, + "auxiliary_loss_mlp": 0.01030407, + "balance_loss_clip": 1.0478915, + "balance_loss_mlp": 1.02269471, + "epoch": 0.6032585823363193, + "flos": 13917791623680.0, + "grad_norm": 2.02468675669203, + "language_loss": 0.82572615, + "learning_rate": 1.4359913323319632e-06, + "loss": 0.84759456, + "num_input_tokens_seen": 108145475, + "step": 5017, + "time_per_iteration": 2.4183881282806396 + }, + { + "auxiliary_loss_clip": 0.01092142, + "auxiliary_loss_mlp": 0.0102733, + "balance_loss_clip": 1.04136455, + "balance_loss_mlp": 1.01943231, + "epoch": 0.6033788252269584, + "flos": 24353252530560.0, + "grad_norm": 1.5656940464598688, + "language_loss": 0.77422225, + "learning_rate": 1.4352440195036847e-06, + "loss": 0.79541701, + "num_input_tokens_seen": 108165650, + "step": 5018, + "time_per_iteration": 2.7103166580200195 + }, + { + "auxiliary_loss_clip": 0.01096191, + "auxiliary_loss_mlp": 0.01024006, + "balance_loss_clip": 1.04015517, + "balance_loss_mlp": 1.01623929, + "epoch": 0.6034990681175976, + "flos": 25521247077120.0, + "grad_norm": 2.2470337703071093, + "language_loss": 0.79701352, + "learning_rate": 1.4344967923484395e-06, + "loss": 0.81821549, + "num_input_tokens_seen": 108187620, + "step": 5019, + "time_per_iteration": 2.645002603530884 + }, + { + "auxiliary_loss_clip": 0.01157419, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.0487802, + "balance_loss_mlp": 1.02129126, + "epoch": 0.6036193110082366, + "flos": 25958387594880.0, + "grad_norm": 2.1305302587415147, + "language_loss": 0.72435927, + "learning_rate": 1.433749650979581e-06, + "loss": 0.74622297, + "num_input_tokens_seen": 108207605, + "step": 5020, + "time_per_iteration": 2.5045955181121826 + }, + { + "auxiliary_loss_clip": 0.01137612, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.04668975, + "balance_loss_mlp": 1.02015305, + "epoch": 0.6037395538988757, + "flos": 25593427457280.0, + "grad_norm": 1.7935822766322709, + "language_loss": 0.67929846, + "learning_rate": 1.433002595510451e-06, + "loss": 0.70095348, + "num_input_tokens_seen": 108226385, + "step": 5021, + "time_per_iteration": 2.550509452819824 + }, + { + "auxiliary_loss_clip": 0.01142626, + "auxiliary_loss_mlp": 0.00763046, + "balance_loss_clip": 1.04645419, + "balance_loss_mlp": 1.00054348, + "epoch": 0.6038597967895148, + "flos": 17816253402240.0, + "grad_norm": 1.8841670330695506, + "language_loss": 0.72065091, + "learning_rate": 1.4322556260543757e-06, + "loss": 0.73970765, + "num_input_tokens_seen": 108242960, + "step": 5022, + "time_per_iteration": 2.4696199893951416 + }, + { + "auxiliary_loss_clip": 0.01042647, + "auxiliary_loss_mlp": 0.01004339, + "balance_loss_clip": 1.01380968, + "balance_loss_mlp": 1.00323629, + "epoch": 0.6039800396801539, + "flos": 65169213235200.0, + "grad_norm": 0.9004752528451603, + "language_loss": 0.62714118, + "learning_rate": 1.4315087427246703e-06, + "loss": 0.64761102, + "num_input_tokens_seen": 108296785, + "step": 5023, + "time_per_iteration": 2.988929271697998 + }, + { + "auxiliary_loss_clip": 0.01074203, + "auxiliary_loss_mlp": 0.01001746, + "balance_loss_clip": 1.01607776, + "balance_loss_mlp": 1.00070286, + "epoch": 0.604100282570793, + "flos": 67386409073280.0, + "grad_norm": 0.8659566953079477, + "language_loss": 0.58467013, + "learning_rate": 1.4307619456346372e-06, + "loss": 0.60542959, + "num_input_tokens_seen": 108341090, + "step": 5024, + "time_per_iteration": 2.72853422164917 + }, + { + "auxiliary_loss_clip": 0.01162675, + "auxiliary_loss_mlp": 0.01027537, + "balance_loss_clip": 1.04793739, + "balance_loss_mlp": 1.01937151, + "epoch": 0.6042205254614321, + "flos": 35297495631360.0, + "grad_norm": 2.4484646796305656, + "language_loss": 0.74530387, + "learning_rate": 1.430015234897564e-06, + "loss": 0.76720595, + "num_input_tokens_seen": 108364370, + "step": 5025, + "time_per_iteration": 2.5868020057678223 + }, + { + "auxiliary_loss_clip": 0.01175461, + "auxiliary_loss_mlp": 0.00762733, + "balance_loss_clip": 1.05122185, + "balance_loss_mlp": 1.00058103, + "epoch": 0.6043407683520712, + "flos": 45658262206080.0, + "grad_norm": 1.9333595655394096, + "language_loss": 0.66328311, + "learning_rate": 1.4292686106267274e-06, + "loss": 0.68266499, + "num_input_tokens_seen": 108387220, + "step": 5026, + "time_per_iteration": 2.66455340385437 + }, + { + "auxiliary_loss_clip": 0.01165658, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.050825, + "balance_loss_mlp": 1.02364135, + "epoch": 0.6044610112427102, + "flos": 16180020138240.0, + "grad_norm": 1.893756904251137, + "language_loss": 0.77115452, + "learning_rate": 1.4285220729353876e-06, + "loss": 0.79312837, + "num_input_tokens_seen": 108405760, + "step": 5027, + "time_per_iteration": 2.4467709064483643 + }, + { + "auxiliary_loss_clip": 0.01143584, + "auxiliary_loss_mlp": 0.01026576, + "balance_loss_clip": 1.04542017, + "balance_loss_mlp": 1.01868415, + "epoch": 0.6045812541333494, + "flos": 13804062186240.0, + "grad_norm": 3.590940819406141, + "language_loss": 0.78105903, + "learning_rate": 1.4277756219367957e-06, + "loss": 0.8027606, + "num_input_tokens_seen": 108422785, + "step": 5028, + "time_per_iteration": 2.458397626876831 + }, + { + "auxiliary_loss_clip": 0.01141256, + "auxiliary_loss_mlp": 0.01025115, + "balance_loss_clip": 1.04836535, + "balance_loss_mlp": 1.01712227, + "epoch": 0.6047014970239885, + "flos": 19975059682560.0, + "grad_norm": 2.1083199103529418, + "language_loss": 0.79953861, + "learning_rate": 1.4270292577441864e-06, + "loss": 0.82120228, + "num_input_tokens_seen": 108442290, + "step": 5029, + "time_per_iteration": 3.335294008255005 + }, + { + "auxiliary_loss_clip": 0.01163883, + "auxiliary_loss_mlp": 0.01026311, + "balance_loss_clip": 1.04761863, + "balance_loss_mlp": 1.01827013, + "epoch": 0.6048217399146275, + "flos": 25337097025920.0, + "grad_norm": 1.554436458152638, + "language_loss": 0.71815109, + "learning_rate": 1.4262829804707836e-06, + "loss": 0.74005306, + "num_input_tokens_seen": 108464280, + "step": 5030, + "time_per_iteration": 2.478785514831543 + }, + { + "auxiliary_loss_clip": 0.01162765, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.04754078, + "balance_loss_mlp": 1.01990485, + "epoch": 0.6049419828052667, + "flos": 26030819370240.0, + "grad_norm": 1.5380958489320469, + "language_loss": 0.69692987, + "learning_rate": 1.4255367902297958e-06, + "loss": 0.71883845, + "num_input_tokens_seen": 108485610, + "step": 5031, + "time_per_iteration": 2.4898414611816406 + }, + { + "auxiliary_loss_clip": 0.01172529, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.05104756, + "balance_loss_mlp": 1.0216012, + "epoch": 0.6050622256959057, + "flos": 14648106948480.0, + "grad_norm": 2.0205071211404957, + "language_loss": 0.78258532, + "learning_rate": 1.4247906871344215e-06, + "loss": 0.8046006, + "num_input_tokens_seen": 108501005, + "step": 5032, + "time_per_iteration": 3.2430365085601807 + }, + { + "auxiliary_loss_clip": 0.01138409, + "auxiliary_loss_mlp": 0.01021865, + "balance_loss_clip": 1.04324579, + "balance_loss_mlp": 1.01417613, + "epoch": 0.6051824685865448, + "flos": 23331450337920.0, + "grad_norm": 1.9961447986959235, + "language_loss": 0.75440133, + "learning_rate": 1.4240446712978415e-06, + "loss": 0.77600408, + "num_input_tokens_seen": 108519990, + "step": 5033, + "time_per_iteration": 2.5093581676483154 + }, + { + "auxiliary_loss_clip": 0.01166141, + "auxiliary_loss_mlp": 0.01023443, + "balance_loss_clip": 1.05149388, + "balance_loss_mlp": 1.01497316, + "epoch": 0.605302711477184, + "flos": 27563307177600.0, + "grad_norm": 1.944082708811842, + "language_loss": 0.74390268, + "learning_rate": 1.423298742833227e-06, + "loss": 0.76579857, + "num_input_tokens_seen": 108538650, + "step": 5034, + "time_per_iteration": 3.369309902191162 + }, + { + "auxiliary_loss_clip": 0.011362, + "auxiliary_loss_mlp": 0.01028501, + "balance_loss_clip": 1.04437637, + "balance_loss_mlp": 1.02083015, + "epoch": 0.605422954367823, + "flos": 15154698412800.0, + "grad_norm": 1.9796431019349092, + "language_loss": 0.71518171, + "learning_rate": 1.4225529018537352e-06, + "loss": 0.73682868, + "num_input_tokens_seen": 108554155, + "step": 5035, + "time_per_iteration": 2.485997438430786 + }, + { + "auxiliary_loss_clip": 0.01174463, + "auxiliary_loss_mlp": 0.01027055, + "balance_loss_clip": 1.05160427, + "balance_loss_mlp": 1.0192349, + "epoch": 0.6055431972584621, + "flos": 27673912131840.0, + "grad_norm": 1.5391614634909567, + "language_loss": 0.77641916, + "learning_rate": 1.4218071484725082e-06, + "loss": 0.79843432, + "num_input_tokens_seen": 108576275, + "step": 5036, + "time_per_iteration": 3.2165653705596924 + }, + { + "auxiliary_loss_clip": 0.01143802, + "auxiliary_loss_mlp": 0.01033921, + "balance_loss_clip": 1.0498203, + "balance_loss_mlp": 1.0262202, + "epoch": 0.6056634401491012, + "flos": 19387489006080.0, + "grad_norm": 1.939184240632925, + "language_loss": 0.76289034, + "learning_rate": 1.4210614828026786e-06, + "loss": 0.78466761, + "num_input_tokens_seen": 108594125, + "step": 5037, + "time_per_iteration": 2.4745450019836426 + }, + { + "auxiliary_loss_clip": 0.01173516, + "auxiliary_loss_mlp": 0.01021558, + "balance_loss_clip": 1.05024421, + "balance_loss_mlp": 1.01408923, + "epoch": 0.6057836830397403, + "flos": 24789459294720.0, + "grad_norm": 1.7997925081321913, + "language_loss": 0.74395478, + "learning_rate": 1.4203159049573605e-06, + "loss": 0.7659055, + "num_input_tokens_seen": 108615360, + "step": 5038, + "time_per_iteration": 2.4586880207061768 + }, + { + "auxiliary_loss_clip": 0.01153919, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.0476861, + "balance_loss_mlp": 1.02069545, + "epoch": 0.6059039259303793, + "flos": 20558248899840.0, + "grad_norm": 2.0111601639072414, + "language_loss": 0.86556637, + "learning_rate": 1.4195704150496593e-06, + "loss": 0.88739151, + "num_input_tokens_seen": 108633075, + "step": 5039, + "time_per_iteration": 2.4744415283203125 + }, + { + "auxiliary_loss_clip": 0.01146098, + "auxiliary_loss_mlp": 0.01025455, + "balance_loss_clip": 1.04911399, + "balance_loss_mlp": 1.0177238, + "epoch": 0.6060241688210185, + "flos": 21069724613760.0, + "grad_norm": 1.6540863857803458, + "language_loss": 0.73883128, + "learning_rate": 1.4188250131926639e-06, + "loss": 0.7605468, + "num_input_tokens_seen": 108651875, + "step": 5040, + "time_per_iteration": 2.480175018310547 + }, + { + "auxiliary_loss_clip": 0.01148466, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.04756987, + "balance_loss_mlp": 1.02094769, + "epoch": 0.6061444117116576, + "flos": 16361081619840.0, + "grad_norm": 1.9325676538841574, + "language_loss": 0.80827814, + "learning_rate": 1.4180796994994525e-06, + "loss": 0.83005571, + "num_input_tokens_seen": 108669290, + "step": 5041, + "time_per_iteration": 2.4469096660614014 + }, + { + "auxiliary_loss_clip": 0.01142959, + "auxiliary_loss_mlp": 0.01021449, + "balance_loss_clip": 1.04533744, + "balance_loss_mlp": 1.01383805, + "epoch": 0.6062646546022966, + "flos": 21507296094720.0, + "grad_norm": 1.7463168202069994, + "language_loss": 0.72617656, + "learning_rate": 1.4173344740830877e-06, + "loss": 0.74782062, + "num_input_tokens_seen": 108688420, + "step": 5042, + "time_per_iteration": 2.480771780014038 + }, + { + "auxiliary_loss_clip": 0.01145253, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.05302942, + "balance_loss_mlp": 1.0237143, + "epoch": 0.6063848974929358, + "flos": 38983151283840.0, + "grad_norm": 1.9515909128149715, + "language_loss": 0.70632088, + "learning_rate": 1.4165893370566206e-06, + "loss": 0.72809023, + "num_input_tokens_seen": 108712175, + "step": 5043, + "time_per_iteration": 2.6273722648620605 + }, + { + "auxiliary_loss_clip": 0.01154709, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.04581594, + "balance_loss_mlp": 1.02180684, + "epoch": 0.6065051403835748, + "flos": 19646584784640.0, + "grad_norm": 1.6974091260036241, + "language_loss": 0.77468133, + "learning_rate": 1.4158442885330865e-06, + "loss": 0.79652762, + "num_input_tokens_seen": 108730745, + "step": 5044, + "time_per_iteration": 2.4389431476593018 + }, + { + "auxiliary_loss_clip": 0.01153892, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.04661655, + "balance_loss_mlp": 1.02289975, + "epoch": 0.6066253832742139, + "flos": 23513086437120.0, + "grad_norm": 1.9480080935240414, + "language_loss": 0.78942889, + "learning_rate": 1.4150993286255094e-06, + "loss": 0.81127989, + "num_input_tokens_seen": 108749995, + "step": 5045, + "time_per_iteration": 2.467611074447632 + }, + { + "auxiliary_loss_clip": 0.0117245, + "auxiliary_loss_mlp": 0.01025729, + "balance_loss_clip": 1.04905248, + "balance_loss_mlp": 1.01823974, + "epoch": 0.6067456261648531, + "flos": 19133708440320.0, + "grad_norm": 2.471559180399533, + "language_loss": 0.79588759, + "learning_rate": 1.4143544574468993e-06, + "loss": 0.81786942, + "num_input_tokens_seen": 108768355, + "step": 5046, + "time_per_iteration": 2.4121909141540527 + }, + { + "auxiliary_loss_clip": 0.01158391, + "auxiliary_loss_mlp": 0.0102485, + "balance_loss_clip": 1.04983783, + "balance_loss_mlp": 1.01658869, + "epoch": 0.6068658690554921, + "flos": 20520614424960.0, + "grad_norm": 1.722803665476089, + "language_loss": 0.82395798, + "learning_rate": 1.4136096751102523e-06, + "loss": 0.84579033, + "num_input_tokens_seen": 108786685, + "step": 5047, + "time_per_iteration": 2.447263479232788 + }, + { + "auxiliary_loss_clip": 0.01149321, + "auxiliary_loss_mlp": 0.0102625, + "balance_loss_clip": 1.0499053, + "balance_loss_mlp": 1.01854324, + "epoch": 0.6069861119461312, + "flos": 27374560185600.0, + "grad_norm": 2.05435511508815, + "language_loss": 0.8265512, + "learning_rate": 1.4128649817285516e-06, + "loss": 0.84830701, + "num_input_tokens_seen": 108804820, + "step": 5048, + "time_per_iteration": 2.5240516662597656 + }, + { + "auxiliary_loss_clip": 0.01149415, + "auxiliary_loss_mlp": 0.01039071, + "balance_loss_clip": 1.04676509, + "balance_loss_mlp": 1.03107178, + "epoch": 0.6071063548367702, + "flos": 25626500904960.0, + "grad_norm": 2.6059429650678747, + "language_loss": 0.63015944, + "learning_rate": 1.412120377414766e-06, + "loss": 0.6520443, + "num_input_tokens_seen": 108825010, + "step": 5049, + "time_per_iteration": 2.5339953899383545 + }, + { + "auxiliary_loss_clip": 0.01176174, + "auxiliary_loss_mlp": 0.01029646, + "balance_loss_clip": 1.05325937, + "balance_loss_mlp": 1.02182603, + "epoch": 0.6072265977274094, + "flos": 24460517520000.0, + "grad_norm": 1.5043191827552223, + "language_loss": 0.71129543, + "learning_rate": 1.4113758622818522e-06, + "loss": 0.73335361, + "num_input_tokens_seen": 108845075, + "step": 5050, + "time_per_iteration": 2.453789710998535 + }, + { + "auxiliary_loss_clip": 0.01151449, + "auxiliary_loss_mlp": 0.00762229, + "balance_loss_clip": 1.05008698, + "balance_loss_mlp": 1.00051451, + "epoch": 0.6073468406180484, + "flos": 18149253413760.0, + "grad_norm": 1.7414512563671498, + "language_loss": 0.82665896, + "learning_rate": 1.410631436442751e-06, + "loss": 0.84579575, + "num_input_tokens_seen": 108863870, + "step": 5051, + "time_per_iteration": 2.470334529876709 + }, + { + "auxiliary_loss_clip": 0.011641, + "auxiliary_loss_mlp": 0.01025167, + "balance_loss_clip": 1.05019867, + "balance_loss_mlp": 1.01738906, + "epoch": 0.6074670835086875, + "flos": 20697617669760.0, + "grad_norm": 2.377530585480949, + "language_loss": 0.86479461, + "learning_rate": 1.4098871000103936e-06, + "loss": 0.88668728, + "num_input_tokens_seen": 108882470, + "step": 5052, + "time_per_iteration": 2.4396162033081055 + }, + { + "auxiliary_loss_clip": 0.01145703, + "auxiliary_loss_mlp": 0.01023482, + "balance_loss_clip": 1.04607844, + "balance_loss_mlp": 1.01618075, + "epoch": 0.6075873263993267, + "flos": 23769955572480.0, + "grad_norm": 1.6721580359970072, + "language_loss": 0.82650232, + "learning_rate": 1.409142853097693e-06, + "loss": 0.84819412, + "num_input_tokens_seen": 108902710, + "step": 5053, + "time_per_iteration": 2.4990317821502686 + }, + { + "auxiliary_loss_clip": 0.01147906, + "auxiliary_loss_mlp": 0.01025782, + "balance_loss_clip": 1.04817462, + "balance_loss_mlp": 1.01827765, + "epoch": 0.6077075692899657, + "flos": 24454484035200.0, + "grad_norm": 2.023040004575697, + "language_loss": 0.79450166, + "learning_rate": 1.408398695817553e-06, + "loss": 0.81623852, + "num_input_tokens_seen": 108919935, + "step": 5054, + "time_per_iteration": 2.5154056549072266 + }, + { + "auxiliary_loss_clip": 0.01144591, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.04607213, + "balance_loss_mlp": 1.02765226, + "epoch": 0.6078278121806048, + "flos": 27382102041600.0, + "grad_norm": 1.5839432614312505, + "language_loss": 0.70070207, + "learning_rate": 1.4076546282828593e-06, + "loss": 0.72251284, + "num_input_tokens_seen": 108942790, + "step": 5055, + "time_per_iteration": 2.54927134513855 + }, + { + "auxiliary_loss_clip": 0.0114762, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.04312754, + "balance_loss_mlp": 1.01954412, + "epoch": 0.6079480550712439, + "flos": 38436447306240.0, + "grad_norm": 2.9606739596897453, + "language_loss": 0.65790534, + "learning_rate": 1.4069106506064874e-06, + "loss": 0.67964995, + "num_input_tokens_seen": 108964215, + "step": 5056, + "time_per_iteration": 3.3528690338134766 + }, + { + "auxiliary_loss_clip": 0.01141754, + "auxiliary_loss_mlp": 0.01027874, + "balance_loss_clip": 1.04757166, + "balance_loss_mlp": 1.0200361, + "epoch": 0.608068297961883, + "flos": 25336271013120.0, + "grad_norm": 2.5495524593936567, + "language_loss": 0.78132248, + "learning_rate": 1.4061667629012989e-06, + "loss": 0.80301875, + "num_input_tokens_seen": 108984885, + "step": 5057, + "time_per_iteration": 2.508213996887207 + }, + { + "auxiliary_loss_clip": 0.01137808, + "auxiliary_loss_mlp": 0.01026486, + "balance_loss_clip": 1.04803705, + "balance_loss_mlp": 1.01888633, + "epoch": 0.608188540852522, + "flos": 24202463235840.0, + "grad_norm": 3.958121331552012, + "language_loss": 0.83207607, + "learning_rate": 1.40542296528014e-06, + "loss": 0.85371894, + "num_input_tokens_seen": 109004545, + "step": 5058, + "time_per_iteration": 2.5215721130371094 + }, + { + "auxiliary_loss_clip": 0.01159327, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.04754245, + "balance_loss_mlp": 1.02409744, + "epoch": 0.6083087837431612, + "flos": 21284146851840.0, + "grad_norm": 2.121510698521352, + "language_loss": 0.75962627, + "learning_rate": 1.4046792578558452e-06, + "loss": 0.78154087, + "num_input_tokens_seen": 109022440, + "step": 5059, + "time_per_iteration": 3.316783905029297 + }, + { + "auxiliary_loss_clip": 0.01141027, + "auxiliary_loss_mlp": 0.0102892, + "balance_loss_clip": 1.04591596, + "balance_loss_mlp": 1.02095699, + "epoch": 0.6084290266338003, + "flos": 16471435178880.0, + "grad_norm": 2.205578236021955, + "language_loss": 0.76122385, + "learning_rate": 1.4039356407412325e-06, + "loss": 0.78292328, + "num_input_tokens_seen": 109035680, + "step": 5060, + "time_per_iteration": 2.4452011585235596 + }, + { + "auxiliary_loss_clip": 0.01066352, + "auxiliary_loss_mlp": 0.0100103, + "balance_loss_clip": 1.01757431, + "balance_loss_mlp": 1.00011778, + "epoch": 0.6085492695244393, + "flos": 66443574931200.0, + "grad_norm": 0.7881127065018196, + "language_loss": 0.57123005, + "learning_rate": 1.40319211404911e-06, + "loss": 0.59190392, + "num_input_tokens_seen": 109090680, + "step": 5061, + "time_per_iteration": 3.8978376388549805 + }, + { + "auxiliary_loss_clip": 0.01174727, + "auxiliary_loss_mlp": 0.01027046, + "balance_loss_clip": 1.05089557, + "balance_loss_mlp": 1.01908255, + "epoch": 0.6086695124150785, + "flos": 23618986709760.0, + "grad_norm": 1.795289184050645, + "language_loss": 0.9066298, + "learning_rate": 1.4024486778922691e-06, + "loss": 0.92864752, + "num_input_tokens_seen": 109108995, + "step": 5062, + "time_per_iteration": 2.4421961307525635 + }, + { + "auxiliary_loss_clip": 0.01149667, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.045959, + "balance_loss_mlp": 1.02214026, + "epoch": 0.6087897553057176, + "flos": 20157054917760.0, + "grad_norm": 1.8913959386274457, + "language_loss": 0.77492893, + "learning_rate": 1.4017053323834884e-06, + "loss": 0.79672498, + "num_input_tokens_seen": 109128825, + "step": 5063, + "time_per_iteration": 2.488363265991211 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.01025676, + "balance_loss_clip": 1.04640567, + "balance_loss_mlp": 1.01812387, + "epoch": 0.6089099981963566, + "flos": 25482535194240.0, + "grad_norm": 1.933533855319585, + "language_loss": 0.75814879, + "learning_rate": 1.4009620776355333e-06, + "loss": 0.77988923, + "num_input_tokens_seen": 109150425, + "step": 5064, + "time_per_iteration": 3.2686140537261963 + }, + { + "auxiliary_loss_clip": 0.01157879, + "auxiliary_loss_mlp": 0.01022043, + "balance_loss_clip": 1.04796982, + "balance_loss_mlp": 1.01452076, + "epoch": 0.6090302410869958, + "flos": 25332895134720.0, + "grad_norm": 1.763732955852719, + "language_loss": 0.79360533, + "learning_rate": 1.4002189137611553e-06, + "loss": 0.81540453, + "num_input_tokens_seen": 109169765, + "step": 5065, + "time_per_iteration": 2.4963266849517822 + }, + { + "auxiliary_loss_clip": 0.01158074, + "auxiliary_loss_mlp": 0.01024063, + "balance_loss_clip": 1.04803658, + "balance_loss_mlp": 1.01673794, + "epoch": 0.6091504839776348, + "flos": 23987358639360.0, + "grad_norm": 1.6654030639640247, + "language_loss": 0.69669372, + "learning_rate": 1.3994758408730901e-06, + "loss": 0.7185151, + "num_input_tokens_seen": 109188950, + "step": 5066, + "time_per_iteration": 2.472099781036377 + }, + { + "auxiliary_loss_clip": 0.01148095, + "auxiliary_loss_mlp": 0.01024653, + "balance_loss_clip": 1.0494163, + "balance_loss_mlp": 1.0162847, + "epoch": 0.6092707268682739, + "flos": 29643037666560.0, + "grad_norm": 2.7554103288725544, + "language_loss": 0.76339543, + "learning_rate": 1.3987328590840629e-06, + "loss": 0.78512293, + "num_input_tokens_seen": 109209895, + "step": 5067, + "time_per_iteration": 2.6054129600524902 + }, + { + "auxiliary_loss_clip": 0.01156096, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.04747033, + "balance_loss_mlp": 1.0207144, + "epoch": 0.609390969758913, + "flos": 24024957200640.0, + "grad_norm": 2.075536064425432, + "language_loss": 0.86368924, + "learning_rate": 1.397989968506783e-06, + "loss": 0.88553053, + "num_input_tokens_seen": 109228905, + "step": 5068, + "time_per_iteration": 2.4662129878997803 + }, + { + "auxiliary_loss_clip": 0.01179347, + "auxiliary_loss_mlp": 0.01034572, + "balance_loss_clip": 1.05269992, + "balance_loss_mlp": 1.02661729, + "epoch": 0.6095112126495521, + "flos": 11102143288320.0, + "grad_norm": 2.228408867935824, + "language_loss": 0.72331738, + "learning_rate": 1.3972471692539458e-06, + "loss": 0.74545658, + "num_input_tokens_seen": 109243620, + "step": 5069, + "time_per_iteration": 2.3966426849365234 + }, + { + "auxiliary_loss_clip": 0.01142669, + "auxiliary_loss_mlp": 0.01023374, + "balance_loss_clip": 1.04760742, + "balance_loss_mlp": 1.01561379, + "epoch": 0.6096314555401912, + "flos": 17265491187840.0, + "grad_norm": 2.8268495863944274, + "language_loss": 0.75498974, + "learning_rate": 1.3965044614382348e-06, + "loss": 0.77665019, + "num_input_tokens_seen": 109259070, + "step": 5070, + "time_per_iteration": 2.445420742034912 + }, + { + "auxiliary_loss_clip": 0.01177421, + "auxiliary_loss_mlp": 0.0102623, + "balance_loss_clip": 1.05192733, + "balance_loss_mlp": 1.01843357, + "epoch": 0.6097516984308303, + "flos": 21645910679040.0, + "grad_norm": 2.1283286011042972, + "language_loss": 0.75576556, + "learning_rate": 1.3957618451723162e-06, + "loss": 0.77780211, + "num_input_tokens_seen": 109275100, + "step": 5071, + "time_per_iteration": 2.426062822341919 + }, + { + "auxiliary_loss_clip": 0.01146785, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.04809344, + "balance_loss_mlp": 1.02062118, + "epoch": 0.6098719413214694, + "flos": 27199208966400.0, + "grad_norm": 1.9461103588496604, + "language_loss": 0.71213925, + "learning_rate": 1.3950193205688457e-06, + "loss": 0.73388755, + "num_input_tokens_seen": 109294825, + "step": 5072, + "time_per_iteration": 2.5500707626342773 + }, + { + "auxiliary_loss_clip": 0.01143449, + "auxiliary_loss_mlp": 0.01025237, + "balance_loss_clip": 1.04828525, + "balance_loss_mlp": 1.01748812, + "epoch": 0.6099921842121084, + "flos": 20412954385920.0, + "grad_norm": 1.839722369053823, + "language_loss": 0.83545953, + "learning_rate": 1.3942768877404627e-06, + "loss": 0.85714644, + "num_input_tokens_seen": 109313790, + "step": 5073, + "time_per_iteration": 2.476454973220825 + }, + { + "auxiliary_loss_clip": 0.01172301, + "auxiliary_loss_mlp": 0.01026507, + "balance_loss_clip": 1.04937267, + "balance_loss_mlp": 1.01946735, + "epoch": 0.6101124271027476, + "flos": 23366139897600.0, + "grad_norm": 1.548529755196695, + "language_loss": 0.73547733, + "learning_rate": 1.393534546799795e-06, + "loss": 0.75746548, + "num_input_tokens_seen": 109333490, + "step": 5074, + "time_per_iteration": 2.4650776386260986 + }, + { + "auxiliary_loss_clip": 0.01138827, + "auxiliary_loss_mlp": 0.01030173, + "balance_loss_clip": 1.04760671, + "balance_loss_mlp": 1.0218401, + "epoch": 0.6102326699933867, + "flos": 26687840993280.0, + "grad_norm": 1.7553216402234515, + "language_loss": 0.67627013, + "learning_rate": 1.3927922978594536e-06, + "loss": 0.69796014, + "num_input_tokens_seen": 109354575, + "step": 5075, + "time_per_iteration": 2.534072160720825 + }, + { + "auxiliary_loss_clip": 0.01060701, + "auxiliary_loss_mlp": 0.01001463, + "balance_loss_clip": 1.01683462, + "balance_loss_mlp": 1.00058126, + "epoch": 0.6103529128840257, + "flos": 60644612551680.0, + "grad_norm": 0.771954237713556, + "language_loss": 0.57446122, + "learning_rate": 1.3920501410320387e-06, + "loss": 0.59508288, + "num_input_tokens_seen": 109410690, + "step": 5076, + "time_per_iteration": 2.9833619594573975 + }, + { + "auxiliary_loss_clip": 0.01146146, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.04586792, + "balance_loss_mlp": 1.01887441, + "epoch": 0.6104731557746649, + "flos": 19021307806080.0, + "grad_norm": 2.046806922183719, + "language_loss": 0.75950503, + "learning_rate": 1.3913080764301333e-06, + "loss": 0.78123569, + "num_input_tokens_seen": 109427650, + "step": 5077, + "time_per_iteration": 2.460766553878784 + }, + { + "auxiliary_loss_clip": 0.01126712, + "auxiliary_loss_mlp": 0.01035234, + "balance_loss_clip": 1.04381859, + "balance_loss_mlp": 1.02750337, + "epoch": 0.6105933986653039, + "flos": 23366894083200.0, + "grad_norm": 1.9358012460082523, + "language_loss": 0.7128607, + "learning_rate": 1.3905661041663085e-06, + "loss": 0.73448014, + "num_input_tokens_seen": 109448835, + "step": 5078, + "time_per_iteration": 2.560464859008789 + }, + { + "auxiliary_loss_clip": 0.01161571, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.05051482, + "balance_loss_mlp": 1.02599907, + "epoch": 0.610713641555943, + "flos": 34637565006720.0, + "grad_norm": 2.2979424204514616, + "language_loss": 0.65208471, + "learning_rate": 1.389824224353122e-06, + "loss": 0.67404014, + "num_input_tokens_seen": 109470425, + "step": 5079, + "time_per_iteration": 2.5610344409942627 + }, + { + "auxiliary_loss_clip": 0.0115995, + "auxiliary_loss_mlp": 0.01023647, + "balance_loss_clip": 1.05076504, + "balance_loss_mlp": 1.0160296, + "epoch": 0.610833884446582, + "flos": 26646471504000.0, + "grad_norm": 1.644420073547241, + "language_loss": 0.76982391, + "learning_rate": 1.389082437103115e-06, + "loss": 0.79165983, + "num_input_tokens_seen": 109489695, + "step": 5080, + "time_per_iteration": 2.4779303073883057 + }, + { + "auxiliary_loss_clip": 0.01129828, + "auxiliary_loss_mlp": 0.01025592, + "balance_loss_clip": 1.04391813, + "balance_loss_mlp": 1.01730132, + "epoch": 0.6109541273372212, + "flos": 21215126868480.0, + "grad_norm": 2.9723896096586984, + "language_loss": 0.77760744, + "learning_rate": 1.3883407425288172e-06, + "loss": 0.79916167, + "num_input_tokens_seen": 109510030, + "step": 5081, + "time_per_iteration": 2.553284168243408 + }, + { + "auxiliary_loss_clip": 0.01142196, + "auxiliary_loss_mlp": 0.01027912, + "balance_loss_clip": 1.04558182, + "balance_loss_mlp": 1.02015734, + "epoch": 0.6110743702278603, + "flos": 20084084438400.0, + "grad_norm": 2.133305995274337, + "language_loss": 0.79975712, + "learning_rate": 1.3875991407427417e-06, + "loss": 0.82145822, + "num_input_tokens_seen": 109528255, + "step": 5082, + "time_per_iteration": 3.2947733402252197 + }, + { + "auxiliary_loss_clip": 0.01046629, + "auxiliary_loss_mlp": 0.01002104, + "balance_loss_clip": 1.01670015, + "balance_loss_mlp": 1.00110853, + "epoch": 0.6111946131184993, + "flos": 68302957438080.0, + "grad_norm": 0.7707187304039955, + "language_loss": 0.58226377, + "learning_rate": 1.38685763185739e-06, + "loss": 0.60275108, + "num_input_tokens_seen": 109581915, + "step": 5083, + "time_per_iteration": 3.105914831161499 + }, + { + "auxiliary_loss_clip": 0.01173656, + "auxiliary_loss_mlp": 0.01026081, + "balance_loss_clip": 1.05030227, + "balance_loss_mlp": 1.01804602, + "epoch": 0.6113148560091385, + "flos": 19937676602880.0, + "grad_norm": 2.415949856770844, + "language_loss": 0.67725444, + "learning_rate": 1.3861162159852476e-06, + "loss": 0.69925183, + "num_input_tokens_seen": 109600050, + "step": 5084, + "time_per_iteration": 2.410493850708008 + }, + { + "auxiliary_loss_clip": 0.01151265, + "auxiliary_loss_mlp": 0.01026259, + "balance_loss_clip": 1.04899645, + "balance_loss_mlp": 1.01778293, + "epoch": 0.6114350988997775, + "flos": 23731854220800.0, + "grad_norm": 2.6658033470219458, + "language_loss": 0.80039108, + "learning_rate": 1.3853748932387875e-06, + "loss": 0.82216632, + "num_input_tokens_seen": 109620690, + "step": 5085, + "time_per_iteration": 3.3655753135681152 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01021389, + "balance_loss_clip": 1.04547048, + "balance_loss_mlp": 1.01365805, + "epoch": 0.6115553417904166, + "flos": 24023700224640.0, + "grad_norm": 2.353728573836011, + "language_loss": 0.75099081, + "learning_rate": 1.3846336637304671e-06, + "loss": 0.77254653, + "num_input_tokens_seen": 109638960, + "step": 5086, + "time_per_iteration": 2.484530210494995 + }, + { + "auxiliary_loss_clip": 0.01141938, + "auxiliary_loss_mlp": 0.01023628, + "balance_loss_clip": 1.04962564, + "balance_loss_mlp": 1.01587987, + "epoch": 0.6116755846810558, + "flos": 23733542160000.0, + "grad_norm": 1.8829038915104588, + "language_loss": 0.83109522, + "learning_rate": 1.3838925275727316e-06, + "loss": 0.8527509, + "num_input_tokens_seen": 109659700, + "step": 5087, + "time_per_iteration": 2.5030009746551514 + }, + { + "auxiliary_loss_clip": 0.01175185, + "auxiliary_loss_mlp": 0.01024154, + "balance_loss_clip": 1.05197108, + "balance_loss_mlp": 1.01677525, + "epoch": 0.6117958275716948, + "flos": 18661626967680.0, + "grad_norm": 2.269638300784814, + "language_loss": 0.79136729, + "learning_rate": 1.3831514848780089e-06, + "loss": 0.81336069, + "num_input_tokens_seen": 109679275, + "step": 5088, + "time_per_iteration": 2.4074864387512207 + }, + { + "auxiliary_loss_clip": 0.01154594, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.04809511, + "balance_loss_mlp": 1.02032161, + "epoch": 0.6119160704623339, + "flos": 16471183783680.0, + "grad_norm": 2.675359491424576, + "language_loss": 0.91493845, + "learning_rate": 1.3824105357587152e-06, + "loss": 0.93676007, + "num_input_tokens_seen": 109696380, + "step": 5089, + "time_per_iteration": 3.2116165161132812 + }, + { + "auxiliary_loss_clip": 0.01140288, + "auxiliary_loss_mlp": 0.01026302, + "balance_loss_clip": 1.04482126, + "balance_loss_mlp": 1.01857138, + "epoch": 0.612036313352973, + "flos": 23915465568000.0, + "grad_norm": 1.5189165546542074, + "language_loss": 0.82676542, + "learning_rate": 1.381669680327253e-06, + "loss": 0.84843129, + "num_input_tokens_seen": 109718060, + "step": 5090, + "time_per_iteration": 3.291295051574707 + }, + { + "auxiliary_loss_clip": 0.01141033, + "auxiliary_loss_mlp": 0.0102603, + "balance_loss_clip": 1.04912925, + "balance_loss_mlp": 1.01788795, + "epoch": 0.6121565562436121, + "flos": 26974766833920.0, + "grad_norm": 1.9626741935955292, + "language_loss": 0.7062344, + "learning_rate": 1.380928918696008e-06, + "loss": 0.72790504, + "num_input_tokens_seen": 109736830, + "step": 5091, + "time_per_iteration": 2.530230760574341 + }, + { + "auxiliary_loss_clip": 0.011591, + "auxiliary_loss_mlp": 0.01025017, + "balance_loss_clip": 1.04774833, + "balance_loss_mlp": 1.01716101, + "epoch": 0.6122767991342511, + "flos": 15668867646720.0, + "grad_norm": 3.1379894187401, + "language_loss": 0.71554208, + "learning_rate": 1.3801882509773548e-06, + "loss": 0.73738325, + "num_input_tokens_seen": 109754690, + "step": 5092, + "time_per_iteration": 2.4204964637756348 + }, + { + "auxiliary_loss_clip": 0.01154851, + "auxiliary_loss_mlp": 0.01025703, + "balance_loss_clip": 1.04695797, + "balance_loss_mlp": 1.01766789, + "epoch": 0.6123970420248903, + "flos": 27964321591680.0, + "grad_norm": 4.2475396185683, + "language_loss": 0.81701678, + "learning_rate": 1.3794476772836503e-06, + "loss": 0.83882236, + "num_input_tokens_seen": 109775790, + "step": 5093, + "time_per_iteration": 2.506866693496704 + }, + { + "auxiliary_loss_clip": 0.01126149, + "auxiliary_loss_mlp": 0.01029923, + "balance_loss_clip": 1.04717636, + "balance_loss_mlp": 1.02154219, + "epoch": 0.6125172849155294, + "flos": 21468727866240.0, + "grad_norm": 1.6608983218580216, + "language_loss": 0.84496176, + "learning_rate": 1.3787071977272402e-06, + "loss": 0.86652255, + "num_input_tokens_seen": 109795050, + "step": 5094, + "time_per_iteration": 2.5703883171081543 + }, + { + "auxiliary_loss_clip": 0.01112877, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.0462805, + "balance_loss_mlp": 1.02293372, + "epoch": 0.6126375278061684, + "flos": 16248321849600.0, + "grad_norm": 3.09254998674022, + "language_loss": 0.71797681, + "learning_rate": 1.3779668124204535e-06, + "loss": 0.73941302, + "num_input_tokens_seen": 109811465, + "step": 5095, + "time_per_iteration": 2.509732484817505 + }, + { + "auxiliary_loss_clip": 0.01141317, + "auxiliary_loss_mlp": 0.01027106, + "balance_loss_clip": 1.04918694, + "balance_loss_mlp": 1.01908934, + "epoch": 0.6127577706968076, + "flos": 20448865008000.0, + "grad_norm": 1.501077716527644, + "language_loss": 0.80910319, + "learning_rate": 1.3772265214756074e-06, + "loss": 0.83078742, + "num_input_tokens_seen": 109831225, + "step": 5096, + "time_per_iteration": 2.4700565338134766 + }, + { + "auxiliary_loss_clip": 0.01161292, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.04705644, + "balance_loss_mlp": 1.0211854, + "epoch": 0.6128780135874466, + "flos": 18260397072000.0, + "grad_norm": 1.8857198553529981, + "language_loss": 0.75419867, + "learning_rate": 1.3764863250050025e-06, + "loss": 0.77609956, + "num_input_tokens_seen": 109849465, + "step": 5097, + "time_per_iteration": 2.4436445236206055 + }, + { + "auxiliary_loss_clip": 0.01132261, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.04572415, + "balance_loss_mlp": 1.02087569, + "epoch": 0.6129982564780857, + "flos": 24937088192640.0, + "grad_norm": 1.792018576268544, + "language_loss": 0.80602854, + "learning_rate": 1.3757462231209272e-06, + "loss": 0.8276341, + "num_input_tokens_seen": 109869770, + "step": 5098, + "time_per_iteration": 2.5587480068206787 + }, + { + "auxiliary_loss_clip": 0.01139473, + "auxiliary_loss_mlp": 0.01024067, + "balance_loss_clip": 1.04609632, + "balance_loss_mlp": 1.01593113, + "epoch": 0.6131184993687249, + "flos": 22492038430080.0, + "grad_norm": 1.983903909425592, + "language_loss": 0.88618672, + "learning_rate": 1.3750062159356525e-06, + "loss": 0.90782213, + "num_input_tokens_seen": 109889120, + "step": 5099, + "time_per_iteration": 2.504131555557251 + }, + { + "auxiliary_loss_clip": 0.0112095, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.04404151, + "balance_loss_mlp": 1.01998353, + "epoch": 0.6132387422593639, + "flos": 15885839750400.0, + "grad_norm": 1.6749461859328214, + "language_loss": 0.83296037, + "learning_rate": 1.3742663035614382e-06, + "loss": 0.85444415, + "num_input_tokens_seen": 109906490, + "step": 5100, + "time_per_iteration": 2.482245683670044 + }, + { + "auxiliary_loss_clip": 0.01175817, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.05095983, + "balance_loss_mlp": 1.0227834, + "epoch": 0.613358985150003, + "flos": 25411539962880.0, + "grad_norm": 3.079416876949993, + "language_loss": 0.80385959, + "learning_rate": 1.3735264861105283e-06, + "loss": 0.82592654, + "num_input_tokens_seen": 109927130, + "step": 5101, + "time_per_iteration": 2.4548351764678955 + }, + { + "auxiliary_loss_clip": 0.01133169, + "auxiliary_loss_mlp": 0.01026517, + "balance_loss_clip": 1.04508257, + "balance_loss_mlp": 1.01901579, + "epoch": 0.6134792280406421, + "flos": 21361283308800.0, + "grad_norm": 2.121434392117287, + "language_loss": 0.78416133, + "learning_rate": 1.372786763695152e-06, + "loss": 0.80575818, + "num_input_tokens_seen": 109945890, + "step": 5102, + "time_per_iteration": 2.516022205352783 + }, + { + "auxiliary_loss_clip": 0.01160849, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.04775882, + "balance_loss_mlp": 1.024001, + "epoch": 0.6135994709312812, + "flos": 21211248199680.0, + "grad_norm": 3.0606831251367246, + "language_loss": 0.77118242, + "learning_rate": 1.3720471364275257e-06, + "loss": 0.79311073, + "num_input_tokens_seen": 109965535, + "step": 5103, + "time_per_iteration": 2.4676713943481445 + }, + { + "auxiliary_loss_clip": 0.0112764, + "auxiliary_loss_mlp": 0.00762904, + "balance_loss_clip": 1.04507315, + "balance_loss_mlp": 1.0007174, + "epoch": 0.6137197138219203, + "flos": 14794047907200.0, + "grad_norm": 2.074752507963412, + "language_loss": 0.78232431, + "learning_rate": 1.3713076044198486e-06, + "loss": 0.80122972, + "num_input_tokens_seen": 109982345, + "step": 5104, + "time_per_iteration": 2.4856326580047607 + }, + { + "auxiliary_loss_clip": 0.01140532, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.04648161, + "balance_loss_mlp": 1.02377343, + "epoch": 0.6138399567125594, + "flos": 20084515401600.0, + "grad_norm": 2.6463185183969746, + "language_loss": 0.81177032, + "learning_rate": 1.3705681677843086e-06, + "loss": 0.83349407, + "num_input_tokens_seen": 110000940, + "step": 5105, + "time_per_iteration": 2.4687111377716064 + }, + { + "auxiliary_loss_clip": 0.01073599, + "auxiliary_loss_mlp": 0.01001425, + "balance_loss_clip": 1.01608825, + "balance_loss_mlp": 1.00047779, + "epoch": 0.6139601996031985, + "flos": 60123838193280.0, + "grad_norm": 0.8062958448978689, + "language_loss": 0.60640794, + "learning_rate": 1.3698288266330768e-06, + "loss": 0.62715822, + "num_input_tokens_seen": 110061565, + "step": 5106, + "time_per_iteration": 3.095419406890869 + }, + { + "auxiliary_loss_clip": 0.01143363, + "auxiliary_loss_mlp": 0.01023012, + "balance_loss_clip": 1.05189347, + "balance_loss_mlp": 1.01555526, + "epoch": 0.6140804424938375, + "flos": 23586703361280.0, + "grad_norm": 3.761532254880022, + "language_loss": 0.72768337, + "learning_rate": 1.3690895810783113e-06, + "loss": 0.74934709, + "num_input_tokens_seen": 110080360, + "step": 5107, + "time_per_iteration": 2.5079920291900635 + }, + { + "auxiliary_loss_clip": 0.01111024, + "auxiliary_loss_mlp": 0.00762779, + "balance_loss_clip": 1.0410949, + "balance_loss_mlp": 1.00069571, + "epoch": 0.6142006853844767, + "flos": 21398199511680.0, + "grad_norm": 2.859343515641965, + "language_loss": 0.71429312, + "learning_rate": 1.3683504312321543e-06, + "loss": 0.73303109, + "num_input_tokens_seen": 110100695, + "step": 5108, + "time_per_iteration": 3.3253512382507324 + }, + { + "auxiliary_loss_clip": 0.01164364, + "auxiliary_loss_mlp": 0.01027314, + "balance_loss_clip": 1.04948795, + "balance_loss_mlp": 1.01930869, + "epoch": 0.6143209282751158, + "flos": 12057367622400.0, + "grad_norm": 2.2641694690977325, + "language_loss": 0.80268037, + "learning_rate": 1.3676113772067355e-06, + "loss": 0.82459706, + "num_input_tokens_seen": 110117750, + "step": 5109, + "time_per_iteration": 2.4397342205047607 + }, + { + "auxiliary_loss_clip": 0.01122927, + "auxiliary_loss_mlp": 0.01024662, + "balance_loss_clip": 1.04479313, + "balance_loss_mlp": 1.01677656, + "epoch": 0.6144411711657548, + "flos": 25082274965760.0, + "grad_norm": 1.7993815904853432, + "language_loss": 0.72608173, + "learning_rate": 1.3668724191141671e-06, + "loss": 0.74755764, + "num_input_tokens_seen": 110137020, + "step": 5110, + "time_per_iteration": 2.574392557144165 + }, + { + "auxiliary_loss_clip": 0.01131348, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.05282533, + "balance_loss_mlp": 1.02523756, + "epoch": 0.6145614140563939, + "flos": 20114069316480.0, + "grad_norm": 2.168741696093098, + "language_loss": 0.6672402, + "learning_rate": 1.3661335570665493e-06, + "loss": 0.68888962, + "num_input_tokens_seen": 110154930, + "step": 5111, + "time_per_iteration": 2.508439302444458 + }, + { + "auxiliary_loss_clip": 0.01151295, + "auxiliary_loss_mlp": 0.01028893, + "balance_loss_clip": 1.05130553, + "balance_loss_mlp": 1.02132928, + "epoch": 0.614681656947033, + "flos": 16800376953600.0, + "grad_norm": 2.4082095880795484, + "language_loss": 0.70047927, + "learning_rate": 1.3653947911759676e-06, + "loss": 0.72228116, + "num_input_tokens_seen": 110172480, + "step": 5112, + "time_per_iteration": 3.3108434677124023 + }, + { + "auxiliary_loss_clip": 0.01110389, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.04393744, + "balance_loss_mlp": 1.0240581, + "epoch": 0.6148018998376721, + "flos": 38801587011840.0, + "grad_norm": 1.9050274812869563, + "language_loss": 0.74706751, + "learning_rate": 1.3646561215544904e-06, + "loss": 0.76849675, + "num_input_tokens_seen": 110197120, + "step": 5113, + "time_per_iteration": 2.7064969539642334 + }, + { + "auxiliary_loss_clip": 0.01161413, + "auxiliary_loss_mlp": 0.01024166, + "balance_loss_clip": 1.05038035, + "balance_loss_mlp": 1.01647115, + "epoch": 0.6149221427283111, + "flos": 23327032965120.0, + "grad_norm": 2.196896747359998, + "language_loss": 0.79495418, + "learning_rate": 1.363917548314176e-06, + "loss": 0.81681001, + "num_input_tokens_seen": 110216385, + "step": 5114, + "time_per_iteration": 2.507715940475464 + }, + { + "auxiliary_loss_clip": 0.01167551, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.04987669, + "balance_loss_mlp": 1.02024519, + "epoch": 0.6150423856189503, + "flos": 22379494141440.0, + "grad_norm": 1.6859480971957332, + "language_loss": 0.73069608, + "learning_rate": 1.3631790715670626e-06, + "loss": 0.75265205, + "num_input_tokens_seen": 110234790, + "step": 5115, + "time_per_iteration": 3.320070266723633 + }, + { + "auxiliary_loss_clip": 0.01080253, + "auxiliary_loss_mlp": 0.01024134, + "balance_loss_clip": 1.04433727, + "balance_loss_mlp": 1.01688588, + "epoch": 0.6151626285095894, + "flos": 18692078722560.0, + "grad_norm": 1.8132990081572578, + "language_loss": 0.85411805, + "learning_rate": 1.3624406914251783e-06, + "loss": 0.87516189, + "num_input_tokens_seen": 110251910, + "step": 5116, + "time_per_iteration": 2.614272356033325 + }, + { + "auxiliary_loss_clip": 0.011616, + "auxiliary_loss_mlp": 0.01028105, + "balance_loss_clip": 1.04844487, + "balance_loss_mlp": 1.0209291, + "epoch": 0.6152828714002284, + "flos": 15851688894720.0, + "grad_norm": 1.875999404706754, + "language_loss": 0.87926996, + "learning_rate": 1.3617024080005335e-06, + "loss": 0.90116704, + "num_input_tokens_seen": 110268810, + "step": 5117, + "time_per_iteration": 3.205634593963623 + }, + { + "auxiliary_loss_clip": 0.01148334, + "auxiliary_loss_mlp": 0.00762388, + "balance_loss_clip": 1.04618704, + "balance_loss_mlp": 1.00068903, + "epoch": 0.6154031142908676, + "flos": 24869792062080.0, + "grad_norm": 1.5118296874236248, + "language_loss": 0.74606073, + "learning_rate": 1.3609642214051266e-06, + "loss": 0.76516789, + "num_input_tokens_seen": 110293035, + "step": 5118, + "time_per_iteration": 2.586886405944824 + }, + { + "auxiliary_loss_clip": 0.01143076, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.05111957, + "balance_loss_mlp": 1.02277803, + "epoch": 0.6155233571815066, + "flos": 19244744357760.0, + "grad_norm": 1.8796245593984653, + "language_loss": 0.66211236, + "learning_rate": 1.3602261317509385e-06, + "loss": 0.68385661, + "num_input_tokens_seen": 110309695, + "step": 5119, + "time_per_iteration": 2.4761598110198975 + }, + { + "auxiliary_loss_clip": 0.0116176, + "auxiliary_loss_mlp": 0.01025005, + "balance_loss_clip": 1.04947948, + "balance_loss_mlp": 1.01654756, + "epoch": 0.6156436000721457, + "flos": 18770077105920.0, + "grad_norm": 2.692538032462891, + "language_loss": 0.82466644, + "learning_rate": 1.3594881391499387e-06, + "loss": 0.84653413, + "num_input_tokens_seen": 110328610, + "step": 5120, + "time_per_iteration": 2.4384398460388184 + }, + { + "auxiliary_loss_clip": 0.01149388, + "auxiliary_loss_mlp": 0.01026951, + "balance_loss_clip": 1.04891694, + "balance_loss_mlp": 1.0190537, + "epoch": 0.6157638429627849, + "flos": 18041198325120.0, + "grad_norm": 2.0166243919535636, + "language_loss": 0.79189581, + "learning_rate": 1.3587502437140778e-06, + "loss": 0.81365919, + "num_input_tokens_seen": 110346775, + "step": 5121, + "time_per_iteration": 2.4741883277893066 + }, + { + "auxiliary_loss_clip": 0.01149189, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.04741859, + "balance_loss_mlp": 1.02150834, + "epoch": 0.6158840858534239, + "flos": 25556726736000.0, + "grad_norm": 2.2288144063697803, + "language_loss": 0.85067892, + "learning_rate": 1.3580124455552952e-06, + "loss": 0.87246656, + "num_input_tokens_seen": 110366140, + "step": 5122, + "time_per_iteration": 2.5107157230377197 + }, + { + "auxiliary_loss_clip": 0.01161623, + "auxiliary_loss_mlp": 0.00761794, + "balance_loss_clip": 1.0498805, + "balance_loss_mlp": 1.00073385, + "epoch": 0.616004328744063, + "flos": 24640788902400.0, + "grad_norm": 1.6461498777306336, + "language_loss": 0.87382662, + "learning_rate": 1.3572747447855148e-06, + "loss": 0.8930608, + "num_input_tokens_seen": 110386550, + "step": 5123, + "time_per_iteration": 2.4893503189086914 + }, + { + "auxiliary_loss_clip": 0.01178223, + "auxiliary_loss_mlp": 0.0102792, + "balance_loss_clip": 1.05366778, + "balance_loss_mlp": 1.01996326, + "epoch": 0.6161245716347021, + "flos": 21689686379520.0, + "grad_norm": 2.2205169980477075, + "language_loss": 0.69211447, + "learning_rate": 1.356537141516644e-06, + "loss": 0.71417594, + "num_input_tokens_seen": 110403970, + "step": 5124, + "time_per_iteration": 2.4197683334350586 + }, + { + "auxiliary_loss_clip": 0.01162462, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.05254543, + "balance_loss_mlp": 1.01904678, + "epoch": 0.6162448145253412, + "flos": 35189225061120.0, + "grad_norm": 2.1713747739713054, + "language_loss": 0.61966944, + "learning_rate": 1.3557996358605775e-06, + "loss": 0.6415633, + "num_input_tokens_seen": 110423890, + "step": 5125, + "time_per_iteration": 2.5521202087402344 + }, + { + "auxiliary_loss_clip": 0.01159368, + "auxiliary_loss_mlp": 0.01031449, + "balance_loss_clip": 1.04893374, + "balance_loss_mlp": 1.02369475, + "epoch": 0.6163650574159802, + "flos": 21615279356160.0, + "grad_norm": 2.137293296134422, + "language_loss": 0.69733453, + "learning_rate": 1.3550622279291941e-06, + "loss": 0.71924269, + "num_input_tokens_seen": 110442035, + "step": 5126, + "time_per_iteration": 2.4518954753875732 + }, + { + "auxiliary_loss_clip": 0.01108061, + "auxiliary_loss_mlp": 0.01026371, + "balance_loss_clip": 1.04277468, + "balance_loss_mlp": 1.01847303, + "epoch": 0.6164853003066194, + "flos": 24572163968640.0, + "grad_norm": 1.4525271136953573, + "language_loss": 0.83256698, + "learning_rate": 1.354324917834358e-06, + "loss": 0.85391128, + "num_input_tokens_seen": 110463280, + "step": 5127, + "time_per_iteration": 2.5621418952941895 + }, + { + "auxiliary_loss_clip": 0.01102617, + "auxiliary_loss_mlp": 0.00762848, + "balance_loss_clip": 1.04464948, + "balance_loss_mlp": 1.00073767, + "epoch": 0.6166055431972585, + "flos": 21835986474240.0, + "grad_norm": 1.740837685526935, + "language_loss": 0.7687853, + "learning_rate": 1.353587705687918e-06, + "loss": 0.78744, + "num_input_tokens_seen": 110481455, + "step": 5128, + "time_per_iteration": 2.589176654815674 + }, + { + "auxiliary_loss_clip": 0.01152411, + "auxiliary_loss_mlp": 0.01026174, + "balance_loss_clip": 1.05061007, + "balance_loss_mlp": 1.01831222, + "epoch": 0.6167257860878975, + "flos": 17785262943360.0, + "grad_norm": 18.805756662928104, + "language_loss": 0.71780175, + "learning_rate": 1.3528505916017096e-06, + "loss": 0.73958755, + "num_input_tokens_seen": 110499155, + "step": 5129, + "time_per_iteration": 2.4418556690216064 + }, + { + "auxiliary_loss_clip": 0.01160673, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.04801965, + "balance_loss_mlp": 1.02291274, + "epoch": 0.6168460289785367, + "flos": 23214811898880.0, + "grad_norm": 1.9956938296100017, + "language_loss": 0.88457108, + "learning_rate": 1.3521135756875514e-06, + "loss": 0.90648663, + "num_input_tokens_seen": 110515470, + "step": 5130, + "time_per_iteration": 2.4279496669769287 + }, + { + "auxiliary_loss_clip": 0.01096143, + "auxiliary_loss_mlp": 0.01027447, + "balance_loss_clip": 1.04391456, + "balance_loss_mlp": 1.02001441, + "epoch": 0.6169662718691757, + "flos": 26213281482240.0, + "grad_norm": 1.5922104159147648, + "language_loss": 0.8646912, + "learning_rate": 1.3513766580572496e-06, + "loss": 0.88592708, + "num_input_tokens_seen": 110538290, + "step": 5131, + "time_per_iteration": 2.64207124710083 + }, + { + "auxiliary_loss_clip": 0.01158315, + "auxiliary_loss_mlp": 0.01025482, + "balance_loss_clip": 1.0488596, + "balance_loss_mlp": 1.01830602, + "epoch": 0.6170865147598148, + "flos": 19026120228480.0, + "grad_norm": 2.001301953903268, + "language_loss": 0.77359492, + "learning_rate": 1.3506398388225924e-06, + "loss": 0.79543287, + "num_input_tokens_seen": 110555610, + "step": 5132, + "time_per_iteration": 2.4383535385131836 + }, + { + "auxiliary_loss_clip": 0.0117407, + "auxiliary_loss_mlp": 0.0102423, + "balance_loss_clip": 1.05286646, + "balance_loss_mlp": 1.01690412, + "epoch": 0.617206757650454, + "flos": 18260361158400.0, + "grad_norm": 1.8237794699146703, + "language_loss": 0.71642679, + "learning_rate": 1.349903118095355e-06, + "loss": 0.73840976, + "num_input_tokens_seen": 110574745, + "step": 5133, + "time_per_iteration": 2.411007881164551 + }, + { + "auxiliary_loss_clip": 0.0116385, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.04898012, + "balance_loss_mlp": 1.02131474, + "epoch": 0.617327000541093, + "flos": 18186959715840.0, + "grad_norm": 1.634229230515609, + "language_loss": 0.73205948, + "learning_rate": 1.349166495987298e-06, + "loss": 0.75398946, + "num_input_tokens_seen": 110593310, + "step": 5134, + "time_per_iteration": 2.4359846115112305 + }, + { + "auxiliary_loss_clip": 0.01061904, + "auxiliary_loss_mlp": 0.01021128, + "balance_loss_clip": 1.02895141, + "balance_loss_mlp": 1.02000737, + "epoch": 0.6174472434317321, + "flos": 61833796122240.0, + "grad_norm": 0.8233008660393084, + "language_loss": 0.60909635, + "learning_rate": 1.348429972610166e-06, + "loss": 0.62992668, + "num_input_tokens_seen": 110657615, + "step": 5135, + "time_per_iteration": 3.934006929397583 + }, + { + "auxiliary_loss_clip": 0.01037221, + "auxiliary_loss_mlp": 0.01008393, + "balance_loss_clip": 1.03129709, + "balance_loss_mlp": 1.00718272, + "epoch": 0.6175674863223712, + "flos": 71230970494080.0, + "grad_norm": 0.8490553402599413, + "language_loss": 0.57908899, + "learning_rate": 1.3476935480756897e-06, + "loss": 0.59954512, + "num_input_tokens_seen": 110714365, + "step": 5136, + "time_per_iteration": 3.005431890487671 + }, + { + "auxiliary_loss_clip": 0.01121309, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.0434444, + "balance_loss_mlp": 1.0258677, + "epoch": 0.6176877292130103, + "flos": 21835447770240.0, + "grad_norm": 2.03787729210879, + "language_loss": 0.756091, + "learning_rate": 1.346957222495583e-06, + "loss": 0.77764165, + "num_input_tokens_seen": 110732160, + "step": 5137, + "time_per_iteration": 2.5312812328338623 + }, + { + "auxiliary_loss_clip": 0.01151661, + "auxiliary_loss_mlp": 0.00762518, + "balance_loss_clip": 1.04998922, + "balance_loss_mlp": 1.00070214, + "epoch": 0.6178079721036493, + "flos": 17741738638080.0, + "grad_norm": 3.0990128599154794, + "language_loss": 0.71393383, + "learning_rate": 1.3462209959815466e-06, + "loss": 0.73307556, + "num_input_tokens_seen": 110746900, + "step": 5138, + "time_per_iteration": 2.4329543113708496 + }, + { + "auxiliary_loss_clip": 0.01151018, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.05056465, + "balance_loss_mlp": 1.01985025, + "epoch": 0.6179282149942885, + "flos": 22633131052800.0, + "grad_norm": 1.7254070638110282, + "language_loss": 0.74007356, + "learning_rate": 1.345484868645265e-06, + "loss": 0.76186162, + "num_input_tokens_seen": 110765710, + "step": 5139, + "time_per_iteration": 3.324608087539673 + }, + { + "auxiliary_loss_clip": 0.01140696, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.04719186, + "balance_loss_mlp": 1.02263427, + "epoch": 0.6180484578849276, + "flos": 22310330503680.0, + "grad_norm": 1.93353122568057, + "language_loss": 0.78708959, + "learning_rate": 1.3447488405984088e-06, + "loss": 0.80880433, + "num_input_tokens_seen": 110783970, + "step": 5140, + "time_per_iteration": 2.5369367599487305 + }, + { + "auxiliary_loss_clip": 0.01144588, + "auxiliary_loss_mlp": 0.01024222, + "balance_loss_clip": 1.04718387, + "balance_loss_mlp": 1.01670551, + "epoch": 0.6181687007755666, + "flos": 35225458905600.0, + "grad_norm": 2.1289467633489254, + "language_loss": 0.69880098, + "learning_rate": 1.3440129119526322e-06, + "loss": 0.72048903, + "num_input_tokens_seen": 110806395, + "step": 5141, + "time_per_iteration": 3.3777740001678467 + }, + { + "auxiliary_loss_clip": 0.0107402, + "auxiliary_loss_mlp": 0.01004253, + "balance_loss_clip": 1.01679921, + "balance_loss_mlp": 1.003335, + "epoch": 0.6182889436662057, + "flos": 61547370094080.0, + "grad_norm": 0.8001682396038746, + "language_loss": 0.51233035, + "learning_rate": 1.3432770828195762e-06, + "loss": 0.53311312, + "num_input_tokens_seen": 110867380, + "step": 5142, + "time_per_iteration": 3.17407488822937 + }, + { + "auxiliary_loss_clip": 0.01120848, + "auxiliary_loss_mlp": 0.01022626, + "balance_loss_clip": 1.04271865, + "balance_loss_mlp": 1.01443088, + "epoch": 0.6184091865568448, + "flos": 19609991804160.0, + "grad_norm": 2.3179526312180276, + "language_loss": 0.70428991, + "learning_rate": 1.3425413533108635e-06, + "loss": 0.72572464, + "num_input_tokens_seen": 110885980, + "step": 5143, + "time_per_iteration": 3.317586898803711 + }, + { + "auxiliary_loss_clip": 0.01120033, + "auxiliary_loss_mlp": 0.01026638, + "balance_loss_clip": 1.04843521, + "balance_loss_mlp": 1.01929438, + "epoch": 0.6185294294474839, + "flos": 23586882929280.0, + "grad_norm": 2.0891966194110334, + "language_loss": 0.70644093, + "learning_rate": 1.341805723538105e-06, + "loss": 0.72790766, + "num_input_tokens_seen": 110906085, + "step": 5144, + "time_per_iteration": 2.6144511699676514 + }, + { + "auxiliary_loss_clip": 0.01155755, + "auxiliary_loss_mlp": 0.01031882, + "balance_loss_clip": 1.05108666, + "balance_loss_mlp": 1.02363276, + "epoch": 0.618649672338123, + "flos": 26762032535040.0, + "grad_norm": 1.6604391015252626, + "language_loss": 0.77731955, + "learning_rate": 1.3410701936128948e-06, + "loss": 0.79919589, + "num_input_tokens_seen": 110928865, + "step": 5145, + "time_per_iteration": 2.5302317142486572 + }, + { + "auxiliary_loss_clip": 0.01163699, + "auxiliary_loss_mlp": 0.01027024, + "balance_loss_clip": 1.05389094, + "balance_loss_mlp": 1.01953161, + "epoch": 0.6187699152287621, + "flos": 14456630522880.0, + "grad_norm": 2.5091876659540846, + "language_loss": 0.84897012, + "learning_rate": 1.340334763646812e-06, + "loss": 0.87087739, + "num_input_tokens_seen": 110943000, + "step": 5146, + "time_per_iteration": 2.3902246952056885 + }, + { + "auxiliary_loss_clip": 0.01177569, + "auxiliary_loss_mlp": 0.01030709, + "balance_loss_clip": 1.05235517, + "balance_loss_mlp": 1.02191734, + "epoch": 0.6188901581194012, + "flos": 20084766796800.0, + "grad_norm": 1.8313339041881738, + "language_loss": 0.74298638, + "learning_rate": 1.3395994337514218e-06, + "loss": 0.76506919, + "num_input_tokens_seen": 110963170, + "step": 5147, + "time_per_iteration": 2.4604387283325195 + }, + { + "auxiliary_loss_clip": 0.01153079, + "auxiliary_loss_mlp": 0.01028197, + "balance_loss_clip": 1.04697967, + "balance_loss_mlp": 1.02035332, + "epoch": 0.6190104010100402, + "flos": 25700728360320.0, + "grad_norm": 1.6864189167042607, + "language_loss": 0.78778219, + "learning_rate": 1.3388642040382725e-06, + "loss": 0.80959487, + "num_input_tokens_seen": 110983595, + "step": 5148, + "time_per_iteration": 2.4791977405548096 + }, + { + "auxiliary_loss_clip": 0.01133196, + "auxiliary_loss_mlp": 0.01025399, + "balance_loss_clip": 1.04170585, + "balance_loss_mlp": 1.01743031, + "epoch": 0.6191306439006794, + "flos": 30442372974720.0, + "grad_norm": 1.7344759551081834, + "language_loss": 0.84180474, + "learning_rate": 1.3381290746188975e-06, + "loss": 0.86339074, + "num_input_tokens_seen": 111002965, + "step": 5149, + "time_per_iteration": 2.581310272216797 + }, + { + "auxiliary_loss_clip": 0.01162987, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.05254984, + "balance_loss_mlp": 1.02409577, + "epoch": 0.6192508867913185, + "flos": 26685793918080.0, + "grad_norm": 1.6903521483416424, + "language_loss": 0.67075741, + "learning_rate": 1.3373940456048152e-06, + "loss": 0.69271052, + "num_input_tokens_seen": 111022990, + "step": 5150, + "time_per_iteration": 2.4894511699676514 + }, + { + "auxiliary_loss_clip": 0.01174787, + "auxiliary_loss_mlp": 0.01024873, + "balance_loss_clip": 1.05245173, + "balance_loss_mlp": 1.01749969, + "epoch": 0.6193711296819575, + "flos": 36722036090880.0, + "grad_norm": 1.8015590074361678, + "language_loss": 0.5909031, + "learning_rate": 1.3366591171075299e-06, + "loss": 0.61289972, + "num_input_tokens_seen": 111046495, + "step": 5151, + "time_per_iteration": 2.550067186355591 + }, + { + "auxiliary_loss_clip": 0.01145976, + "auxiliary_loss_mlp": 0.0102376, + "balance_loss_clip": 1.04841852, + "balance_loss_mlp": 1.01637459, + "epoch": 0.6194913725725967, + "flos": 25192556697600.0, + "grad_norm": 1.9518248568930159, + "language_loss": 0.91008627, + "learning_rate": 1.335924289238529e-06, + "loss": 0.93178362, + "num_input_tokens_seen": 111065705, + "step": 5152, + "time_per_iteration": 2.516597270965576 + }, + { + "auxiliary_loss_clip": 0.0114438, + "auxiliary_loss_mlp": 0.00763011, + "balance_loss_clip": 1.05101037, + "balance_loss_mlp": 1.00079656, + "epoch": 0.6196116154632357, + "flos": 21178821196800.0, + "grad_norm": 1.5269647474180734, + "language_loss": 0.76932395, + "learning_rate": 1.3351895621092859e-06, + "loss": 0.78839779, + "num_input_tokens_seen": 111086050, + "step": 5153, + "time_per_iteration": 2.5211715698242188 + }, + { + "auxiliary_loss_clip": 0.01081732, + "auxiliary_loss_mlp": 0.01029174, + "balance_loss_clip": 1.03751802, + "balance_loss_mlp": 1.02149105, + "epoch": 0.6197318583538748, + "flos": 16253744803200.0, + "grad_norm": 1.8713889016813556, + "language_loss": 0.76723945, + "learning_rate": 1.3344549358312567e-06, + "loss": 0.7883485, + "num_input_tokens_seen": 111104450, + "step": 5154, + "time_per_iteration": 2.616731643676758 + }, + { + "auxiliary_loss_clip": 0.01165772, + "auxiliary_loss_mlp": 0.01023722, + "balance_loss_clip": 1.05164266, + "balance_loss_mlp": 1.01562738, + "epoch": 0.619852101244514, + "flos": 24425612478720.0, + "grad_norm": 1.8889001754231425, + "language_loss": 0.78362107, + "learning_rate": 1.3337204105158852e-06, + "loss": 0.805516, + "num_input_tokens_seen": 111123320, + "step": 5155, + "time_per_iteration": 2.4758620262145996 + }, + { + "auxiliary_loss_clip": 0.01115153, + "auxiliary_loss_mlp": 0.01027215, + "balance_loss_clip": 1.03721404, + "balance_loss_mlp": 1.01937151, + "epoch": 0.619972344135153, + "flos": 16727298733440.0, + "grad_norm": 1.8317029600533525, + "language_loss": 0.72757632, + "learning_rate": 1.332985986274597e-06, + "loss": 0.74899995, + "num_input_tokens_seen": 111140950, + "step": 5156, + "time_per_iteration": 2.491074323654175 + }, + { + "auxiliary_loss_clip": 0.01095943, + "auxiliary_loss_mlp": 0.00762421, + "balance_loss_clip": 1.04548001, + "balance_loss_mlp": 1.00068665, + "epoch": 0.6200925870257921, + "flos": 12495190498560.0, + "grad_norm": 1.9978033146800827, + "language_loss": 0.75580359, + "learning_rate": 1.3322516632188047e-06, + "loss": 0.77438724, + "num_input_tokens_seen": 111157845, + "step": 5157, + "time_per_iteration": 2.5521092414855957 + }, + { + "auxiliary_loss_clip": 0.01129246, + "auxiliary_loss_mlp": 0.01027925, + "balance_loss_clip": 1.0455339, + "balance_loss_mlp": 1.01991391, + "epoch": 0.6202128299164312, + "flos": 26539350168960.0, + "grad_norm": 1.7963965760158007, + "language_loss": 0.6719082, + "learning_rate": 1.3315174414599045e-06, + "loss": 0.69347996, + "num_input_tokens_seen": 111179165, + "step": 5158, + "time_per_iteration": 2.5751895904541016 + }, + { + "auxiliary_loss_clip": 0.01156214, + "auxiliary_loss_mlp": 0.01024233, + "balance_loss_clip": 1.04734278, + "balance_loss_mlp": 1.01568592, + "epoch": 0.6203330728070703, + "flos": 18770508069120.0, + "grad_norm": 2.1245506726345535, + "language_loss": 0.75202847, + "learning_rate": 1.3307833211092768e-06, + "loss": 0.77383298, + "num_input_tokens_seen": 111197830, + "step": 5159, + "time_per_iteration": 2.4460315704345703 + }, + { + "auxiliary_loss_clip": 0.01177647, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.05416012, + "balance_loss_mlp": 1.02321851, + "epoch": 0.6204533156977093, + "flos": 20629782835200.0, + "grad_norm": 1.6359629451403146, + "language_loss": 0.75242364, + "learning_rate": 1.3300493022782873e-06, + "loss": 0.77451134, + "num_input_tokens_seen": 111218400, + "step": 5160, + "time_per_iteration": 2.437129020690918 + }, + { + "auxiliary_loss_clip": 0.01109571, + "auxiliary_loss_mlp": 0.00763293, + "balance_loss_clip": 1.04406416, + "balance_loss_mlp": 1.00076747, + "epoch": 0.6205735585883485, + "flos": 17348050598400.0, + "grad_norm": 1.789987078371122, + "language_loss": 0.72415805, + "learning_rate": 1.3293153850782855e-06, + "loss": 0.74288672, + "num_input_tokens_seen": 111236720, + "step": 5161, + "time_per_iteration": 2.529670000076294 + }, + { + "auxiliary_loss_clip": 0.01123569, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.04506731, + "balance_loss_mlp": 1.0192728, + "epoch": 0.6206938014789876, + "flos": 22965017742720.0, + "grad_norm": 2.394727344070964, + "language_loss": 0.71178114, + "learning_rate": 1.3285815696206069e-06, + "loss": 0.73330021, + "num_input_tokens_seen": 111258265, + "step": 5162, + "time_per_iteration": 3.2981951236724854 + }, + { + "auxiliary_loss_clip": 0.01133436, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.04450345, + "balance_loss_mlp": 1.02477264, + "epoch": 0.6208140443696266, + "flos": 23983192661760.0, + "grad_norm": 2.69615125241396, + "language_loss": 0.7683441, + "learning_rate": 1.32784785601657e-06, + "loss": 0.79000771, + "num_input_tokens_seen": 111277675, + "step": 5163, + "time_per_iteration": 2.548370599746704 + }, + { + "auxiliary_loss_clip": 0.01148839, + "auxiliary_loss_mlp": 0.01024809, + "balance_loss_clip": 1.0469408, + "balance_loss_mlp": 1.01693249, + "epoch": 0.6209342872602658, + "flos": 35077291303680.0, + "grad_norm": 1.7211830320276993, + "language_loss": 0.73686028, + "learning_rate": 1.3271142443774798e-06, + "loss": 0.75859672, + "num_input_tokens_seen": 111299910, + "step": 5164, + "time_per_iteration": 2.629284143447876 + }, + { + "auxiliary_loss_clip": 0.01144833, + "auxiliary_loss_mlp": 0.01022473, + "balance_loss_clip": 1.04928827, + "balance_loss_mlp": 1.01459968, + "epoch": 0.6210545301509048, + "flos": 26979327861120.0, + "grad_norm": 1.766327599393501, + "language_loss": 0.81522989, + "learning_rate": 1.3263807348146228e-06, + "loss": 0.83690292, + "num_input_tokens_seen": 111319765, + "step": 5165, + "time_per_iteration": 2.5535645484924316 + }, + { + "auxiliary_loss_clip": 0.0114433, + "auxiliary_loss_mlp": 0.01036337, + "balance_loss_clip": 1.04550493, + "balance_loss_mlp": 1.0276525, + "epoch": 0.6211747730415439, + "flos": 33618240852480.0, + "grad_norm": 2.192007811063716, + "language_loss": 0.73259747, + "learning_rate": 1.3256473274392733e-06, + "loss": 0.75440407, + "num_input_tokens_seen": 111341110, + "step": 5166, + "time_per_iteration": 3.4649343490600586 + }, + { + "auxiliary_loss_clip": 0.01174752, + "auxiliary_loss_mlp": 0.01029226, + "balance_loss_clip": 1.05107534, + "balance_loss_mlp": 1.02116108, + "epoch": 0.6212950159321831, + "flos": 34167099646080.0, + "grad_norm": 1.9716200427603439, + "language_loss": 0.69928497, + "learning_rate": 1.3249140223626873e-06, + "loss": 0.72132474, + "num_input_tokens_seen": 111362730, + "step": 5167, + "time_per_iteration": 3.3117260932922363 + }, + { + "auxiliary_loss_clip": 0.01159025, + "auxiliary_loss_mlp": 0.01025647, + "balance_loss_clip": 1.05048549, + "balance_loss_mlp": 1.01794565, + "epoch": 0.6214152588228221, + "flos": 27965758135680.0, + "grad_norm": 1.8686629737140226, + "language_loss": 0.75584412, + "learning_rate": 1.3241808196961077e-06, + "loss": 0.77769083, + "num_input_tokens_seen": 111383855, + "step": 5168, + "time_per_iteration": 2.499868392944336 + }, + { + "auxiliary_loss_clip": 0.01133113, + "auxiliary_loss_mlp": 0.01024034, + "balance_loss_clip": 1.04511786, + "balance_loss_mlp": 1.01642799, + "epoch": 0.6215355017134612, + "flos": 20230204965120.0, + "grad_norm": 1.7888571058056566, + "language_loss": 0.71031201, + "learning_rate": 1.3234477195507608e-06, + "loss": 0.73188341, + "num_input_tokens_seen": 111402685, + "step": 5169, + "time_per_iteration": 2.4794108867645264 + }, + { + "auxiliary_loss_clip": 0.01132301, + "auxiliary_loss_mlp": 0.01027922, + "balance_loss_clip": 1.04677236, + "balance_loss_mlp": 1.02076328, + "epoch": 0.6216557446041003, + "flos": 41428129219200.0, + "grad_norm": 2.3187113633964214, + "language_loss": 0.62189484, + "learning_rate": 1.322714722037857e-06, + "loss": 0.64349705, + "num_input_tokens_seen": 111424130, + "step": 5170, + "time_per_iteration": 3.42864727973938 + }, + { + "auxiliary_loss_clip": 0.01140212, + "auxiliary_loss_mlp": 0.01030152, + "balance_loss_clip": 1.04759312, + "balance_loss_mlp": 1.02231979, + "epoch": 0.6217759874947394, + "flos": 27928770105600.0, + "grad_norm": 2.0451137869749543, + "language_loss": 0.77578139, + "learning_rate": 1.321981827268591e-06, + "loss": 0.79748499, + "num_input_tokens_seen": 111444785, + "step": 5171, + "time_per_iteration": 2.5806195735931396 + }, + { + "auxiliary_loss_clip": 0.01150253, + "auxiliary_loss_mlp": 0.01028771, + "balance_loss_clip": 1.04932022, + "balance_loss_mlp": 1.02086735, + "epoch": 0.6218962303853784, + "flos": 21765673601280.0, + "grad_norm": 1.697017083153132, + "language_loss": 0.81457758, + "learning_rate": 1.3212490353541426e-06, + "loss": 0.83636785, + "num_input_tokens_seen": 111467045, + "step": 5172, + "time_per_iteration": 2.5358362197875977 + }, + { + "auxiliary_loss_clip": 0.01176365, + "auxiliary_loss_mlp": 0.01024585, + "balance_loss_clip": 1.05082786, + "balance_loss_mlp": 1.01658583, + "epoch": 0.6220164732760175, + "flos": 21246260981760.0, + "grad_norm": 1.9215630578600953, + "language_loss": 0.80180526, + "learning_rate": 1.3205163464056762e-06, + "loss": 0.82381475, + "num_input_tokens_seen": 111483650, + "step": 5173, + "time_per_iteration": 2.428027868270874 + }, + { + "auxiliary_loss_clip": 0.01158613, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.04830134, + "balance_loss_mlp": 1.0210371, + "epoch": 0.6221367161666567, + "flos": 26136360506880.0, + "grad_norm": 1.883102375572845, + "language_loss": 0.72920758, + "learning_rate": 1.319783760534339e-06, + "loss": 0.75108123, + "num_input_tokens_seen": 111502895, + "step": 5174, + "time_per_iteration": 2.4963786602020264 + }, + { + "auxiliary_loss_clip": 0.01161858, + "auxiliary_loss_mlp": 0.01030575, + "balance_loss_clip": 1.052176, + "balance_loss_mlp": 1.02232587, + "epoch": 0.6222569590572957, + "flos": 16284196558080.0, + "grad_norm": 2.248326752209355, + "language_loss": 0.75213224, + "learning_rate": 1.319051277851266e-06, + "loss": 0.77405655, + "num_input_tokens_seen": 111519180, + "step": 5175, + "time_per_iteration": 2.431265115737915 + }, + { + "auxiliary_loss_clip": 0.01162926, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.04964113, + "balance_loss_mlp": 1.02309537, + "epoch": 0.6223772019479348, + "flos": 18223840005120.0, + "grad_norm": 1.8167662182747357, + "language_loss": 0.842345, + "learning_rate": 1.3183188984675716e-06, + "loss": 0.86428225, + "num_input_tokens_seen": 111537545, + "step": 5176, + "time_per_iteration": 2.429539203643799 + }, + { + "auxiliary_loss_clip": 0.0114653, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.05012107, + "balance_loss_mlp": 1.02555668, + "epoch": 0.6224974448385739, + "flos": 27489797994240.0, + "grad_norm": 2.4275330939966455, + "language_loss": 0.71225476, + "learning_rate": 1.3175866224943586e-06, + "loss": 0.73405015, + "num_input_tokens_seen": 111556265, + "step": 5177, + "time_per_iteration": 2.5202741622924805 + }, + { + "auxiliary_loss_clip": 0.01150551, + "auxiliary_loss_mlp": 0.01029362, + "balance_loss_clip": 1.04937506, + "balance_loss_mlp": 1.02128005, + "epoch": 0.622617687729213, + "flos": 19791951125760.0, + "grad_norm": 2.5902245639624097, + "language_loss": 0.73468375, + "learning_rate": 1.316854450042712e-06, + "loss": 0.75648284, + "num_input_tokens_seen": 111574205, + "step": 5178, + "time_per_iteration": 2.468618392944336 + }, + { + "auxiliary_loss_clip": 0.01166919, + "auxiliary_loss_mlp": 0.01025371, + "balance_loss_clip": 1.05129933, + "balance_loss_mlp": 1.01751196, + "epoch": 0.622737930619852, + "flos": 23038886062080.0, + "grad_norm": 1.9047970956836242, + "language_loss": 0.74362719, + "learning_rate": 1.3161223812237024e-06, + "loss": 0.76555008, + "num_input_tokens_seen": 111593560, + "step": 5179, + "time_per_iteration": 2.4668078422546387 + }, + { + "auxiliary_loss_clip": 0.01174002, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.04919124, + "balance_loss_mlp": 1.02327764, + "epoch": 0.6228581735104912, + "flos": 12634271959680.0, + "grad_norm": 2.937125254931495, + "language_loss": 0.84914672, + "learning_rate": 1.3153904161483842e-06, + "loss": 0.87119859, + "num_input_tokens_seen": 111608860, + "step": 5180, + "time_per_iteration": 2.3765153884887695 + }, + { + "auxiliary_loss_clip": 0.01127725, + "auxiliary_loss_mlp": 0.01026438, + "balance_loss_clip": 1.04467797, + "balance_loss_mlp": 1.01800966, + "epoch": 0.6229784164011303, + "flos": 23802813538560.0, + "grad_norm": 2.0340228026532836, + "language_loss": 0.85037589, + "learning_rate": 1.3146585549277953e-06, + "loss": 0.87191749, + "num_input_tokens_seen": 111627500, + "step": 5181, + "time_per_iteration": 2.5239310264587402 + }, + { + "auxiliary_loss_clip": 0.01157476, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.05071926, + "balance_loss_mlp": 1.02495217, + "epoch": 0.6230986592917693, + "flos": 22414219614720.0, + "grad_norm": 2.0717912476862907, + "language_loss": 0.78470969, + "learning_rate": 1.3139267976729591e-06, + "loss": 0.80661196, + "num_input_tokens_seen": 111647690, + "step": 5182, + "time_per_iteration": 2.4967501163482666 + }, + { + "auxiliary_loss_clip": 0.01165023, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.05192065, + "balance_loss_mlp": 1.02140307, + "epoch": 0.6232189021824085, + "flos": 34528217028480.0, + "grad_norm": 1.642552819933219, + "language_loss": 0.71445298, + "learning_rate": 1.3131951444948815e-06, + "loss": 0.73640186, + "num_input_tokens_seen": 111667090, + "step": 5183, + "time_per_iteration": 2.5647177696228027 + }, + { + "auxiliary_loss_clip": 0.01149273, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.04965508, + "balance_loss_mlp": 1.02431846, + "epoch": 0.6233391450730476, + "flos": 22237000888320.0, + "grad_norm": 2.1184798783130776, + "language_loss": 0.76420403, + "learning_rate": 1.3124635955045546e-06, + "loss": 0.78602004, + "num_input_tokens_seen": 111686905, + "step": 5184, + "time_per_iteration": 2.4844810962677 + }, + { + "auxiliary_loss_clip": 0.01107052, + "auxiliary_loss_mlp": 0.00763131, + "balance_loss_clip": 1.04104447, + "balance_loss_mlp": 1.00061691, + "epoch": 0.6234593879636866, + "flos": 20332693445760.0, + "grad_norm": 1.832222907773644, + "language_loss": 0.83983433, + "learning_rate": 1.3117321508129537e-06, + "loss": 0.85853612, + "num_input_tokens_seen": 111704985, + "step": 5185, + "time_per_iteration": 2.5574467182159424 + }, + { + "auxiliary_loss_clip": 0.0114947, + "auxiliary_loss_mlp": 0.01022647, + "balance_loss_clip": 1.04886055, + "balance_loss_mlp": 1.01532722, + "epoch": 0.6235796308543258, + "flos": 20664903358080.0, + "grad_norm": 1.5358608182006221, + "language_loss": 0.76228631, + "learning_rate": 1.3110008105310388e-06, + "loss": 0.78400743, + "num_input_tokens_seen": 111724805, + "step": 5186, + "time_per_iteration": 2.502291440963745 + }, + { + "auxiliary_loss_clip": 0.01176002, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.0499531, + "balance_loss_mlp": 1.02547073, + "epoch": 0.6236998737449648, + "flos": 26618641441920.0, + "grad_norm": 1.620482699143074, + "language_loss": 0.78040814, + "learning_rate": 1.3102695747697526e-06, + "loss": 0.80250406, + "num_input_tokens_seen": 111747675, + "step": 5187, + "time_per_iteration": 2.491316318511963 + }, + { + "auxiliary_loss_clip": 0.01104596, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.04617989, + "balance_loss_mlp": 1.01916695, + "epoch": 0.6238201166356039, + "flos": 12674599954560.0, + "grad_norm": 3.164098414209472, + "language_loss": 0.90217841, + "learning_rate": 1.3095384436400237e-06, + "loss": 0.92349803, + "num_input_tokens_seen": 111759205, + "step": 5188, + "time_per_iteration": 2.5018911361694336 + }, + { + "auxiliary_loss_clip": 0.01153973, + "auxiliary_loss_mlp": 0.01023299, + "balance_loss_clip": 1.04810059, + "balance_loss_mlp": 1.01557386, + "epoch": 0.623940359526243, + "flos": 10452160730880.0, + "grad_norm": 2.1649389303604223, + "language_loss": 0.81824613, + "learning_rate": 1.3088074172527633e-06, + "loss": 0.84001887, + "num_input_tokens_seen": 111776335, + "step": 5189, + "time_per_iteration": 3.333324432373047 + }, + { + "auxiliary_loss_clip": 0.01148948, + "auxiliary_loss_mlp": 0.01023697, + "balance_loss_clip": 1.04586041, + "balance_loss_mlp": 1.0154295, + "epoch": 0.6240606024168821, + "flos": 29059525226880.0, + "grad_norm": 1.7440212301044016, + "language_loss": 0.71493834, + "learning_rate": 1.3080764957188684e-06, + "loss": 0.73666477, + "num_input_tokens_seen": 111796580, + "step": 5190, + "time_per_iteration": 2.5504536628723145 + }, + { + "auxiliary_loss_clip": 0.01122099, + "auxiliary_loss_mlp": 0.0102464, + "balance_loss_clip": 1.04511976, + "balance_loss_mlp": 1.01633668, + "epoch": 0.6241808453075212, + "flos": 22018089450240.0, + "grad_norm": 1.7427760892535424, + "language_loss": 0.70954829, + "learning_rate": 1.3073456791492192e-06, + "loss": 0.73101568, + "num_input_tokens_seen": 111816290, + "step": 5191, + "time_per_iteration": 2.557647943496704 + }, + { + "auxiliary_loss_clip": 0.01148211, + "auxiliary_loss_mlp": 0.01025333, + "balance_loss_clip": 1.04606092, + "balance_loss_mlp": 1.01770949, + "epoch": 0.6243010881981603, + "flos": 21138708683520.0, + "grad_norm": 1.7562718852180146, + "language_loss": 0.78606039, + "learning_rate": 1.3066149676546801e-06, + "loss": 0.80779582, + "num_input_tokens_seen": 111834470, + "step": 5192, + "time_per_iteration": 2.4930386543273926 + }, + { + "auxiliary_loss_clip": 0.01148658, + "auxiliary_loss_mlp": 0.01028748, + "balance_loss_clip": 1.05349004, + "balance_loss_mlp": 1.02068949, + "epoch": 0.6244213310887994, + "flos": 22344948236160.0, + "grad_norm": 1.6920099490156852, + "language_loss": 0.6629023, + "learning_rate": 1.3058843613460985e-06, + "loss": 0.68467641, + "num_input_tokens_seen": 111852410, + "step": 5193, + "time_per_iteration": 3.344407081604004 + }, + { + "auxiliary_loss_clip": 0.01140214, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.04753375, + "balance_loss_mlp": 1.01972127, + "epoch": 0.6245415739794384, + "flos": 15231978524160.0, + "grad_norm": 1.9263940141314557, + "language_loss": 0.74089432, + "learning_rate": 1.3051538603343075e-06, + "loss": 0.76257157, + "num_input_tokens_seen": 111870340, + "step": 5194, + "time_per_iteration": 3.365662097930908 + }, + { + "auxiliary_loss_clip": 0.01161555, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.0514853, + "balance_loss_mlp": 1.02819633, + "epoch": 0.6246618168700776, + "flos": 18879891960960.0, + "grad_norm": 1.9666510575706388, + "language_loss": 0.67418522, + "learning_rate": 1.3044234647301235e-06, + "loss": 0.69616175, + "num_input_tokens_seen": 111888365, + "step": 5195, + "time_per_iteration": 2.442220449447632 + }, + { + "auxiliary_loss_clip": 0.01155336, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.04876482, + "balance_loss_mlp": 1.01986647, + "epoch": 0.6247820597607167, + "flos": 14319201087360.0, + "grad_norm": 1.9101932011054747, + "language_loss": 0.72601497, + "learning_rate": 1.303693174644347e-06, + "loss": 0.74783587, + "num_input_tokens_seen": 111905840, + "step": 5196, + "time_per_iteration": 3.16748046875 + }, + { + "auxiliary_loss_clip": 0.0114268, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.04643488, + "balance_loss_mlp": 1.02220666, + "epoch": 0.6249023026513557, + "flos": 22637979388800.0, + "grad_norm": 2.0862632571542226, + "language_loss": 0.80467904, + "learning_rate": 1.3029629901877625e-06, + "loss": 0.82641375, + "num_input_tokens_seen": 111925215, + "step": 5197, + "time_per_iteration": 2.4859719276428223 + }, + { + "auxiliary_loss_clip": 0.01168862, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.05253983, + "balance_loss_mlp": 1.01925683, + "epoch": 0.6250225455419949, + "flos": 20266690204800.0, + "grad_norm": 3.0725458616635564, + "language_loss": 0.77077407, + "learning_rate": 1.3022329114711376e-06, + "loss": 0.79273844, + "num_input_tokens_seen": 111943925, + "step": 5198, + "time_per_iteration": 2.4443161487579346 + }, + { + "auxiliary_loss_clip": 0.01143222, + "auxiliary_loss_mlp": 0.0102402, + "balance_loss_clip": 1.04749584, + "balance_loss_mlp": 1.016361, + "epoch": 0.6251427884326339, + "flos": 23437853400960.0, + "grad_norm": 2.4790473496002225, + "language_loss": 0.69548547, + "learning_rate": 1.3015029386052256e-06, + "loss": 0.71715784, + "num_input_tokens_seen": 111964095, + "step": 5199, + "time_per_iteration": 2.512484550476074 + }, + { + "auxiliary_loss_clip": 0.01144577, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.04931223, + "balance_loss_mlp": 1.02269578, + "epoch": 0.625263031323273, + "flos": 31723055464320.0, + "grad_norm": 2.127825540730026, + "language_loss": 0.72924012, + "learning_rate": 1.3007730717007622e-06, + "loss": 0.75099194, + "num_input_tokens_seen": 111984910, + "step": 5200, + "time_per_iteration": 2.603738784790039 + }, + { + "auxiliary_loss_clip": 0.01177455, + "auxiliary_loss_mlp": 0.01029186, + "balance_loss_clip": 1.05234516, + "balance_loss_mlp": 1.02088344, + "epoch": 0.6253832742139122, + "flos": 24134341092480.0, + "grad_norm": 1.7274660524267338, + "language_loss": 0.75600898, + "learning_rate": 1.3000433108684676e-06, + "loss": 0.77807534, + "num_input_tokens_seen": 112005410, + "step": 5201, + "time_per_iteration": 2.450544834136963 + }, + { + "auxiliary_loss_clip": 0.01157304, + "auxiliary_loss_mlp": 0.01022565, + "balance_loss_clip": 1.04908442, + "balance_loss_mlp": 1.01484585, + "epoch": 0.6255035171045512, + "flos": 27668812400640.0, + "grad_norm": 9.456925047273225, + "language_loss": 0.8065123, + "learning_rate": 1.2993136562190467e-06, + "loss": 0.82831097, + "num_input_tokens_seen": 112024530, + "step": 5202, + "time_per_iteration": 2.4952661991119385 + }, + { + "auxiliary_loss_clip": 0.01150399, + "auxiliary_loss_mlp": 0.01028074, + "balance_loss_clip": 1.04889143, + "balance_loss_mlp": 1.01988471, + "epoch": 0.6256237599951903, + "flos": 20227798753920.0, + "grad_norm": 1.5608837175036179, + "language_loss": 0.70270973, + "learning_rate": 1.2985841078631871e-06, + "loss": 0.72449446, + "num_input_tokens_seen": 112043850, + "step": 5203, + "time_per_iteration": 2.4833059310913086 + }, + { + "auxiliary_loss_clip": 0.01099335, + "auxiliary_loss_mlp": 0.01031528, + "balance_loss_clip": 1.03912997, + "balance_loss_mlp": 1.02352262, + "epoch": 0.6257440028858293, + "flos": 24170574936960.0, + "grad_norm": 1.8296164103154822, + "language_loss": 0.78451347, + "learning_rate": 1.2978546659115608e-06, + "loss": 0.80582213, + "num_input_tokens_seen": 112061930, + "step": 5204, + "time_per_iteration": 2.5853633880615234 + }, + { + "auxiliary_loss_clip": 0.01149213, + "auxiliary_loss_mlp": 0.01030503, + "balance_loss_clip": 1.0490036, + "balance_loss_mlp": 1.02269483, + "epoch": 0.6258642457764685, + "flos": 15851940289920.0, + "grad_norm": 1.9241035156068131, + "language_loss": 0.85477406, + "learning_rate": 1.2971253304748228e-06, + "loss": 0.87657118, + "num_input_tokens_seen": 112079645, + "step": 5205, + "time_per_iteration": 2.4536120891571045 + }, + { + "auxiliary_loss_clip": 0.01166588, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.05259645, + "balance_loss_mlp": 1.02108145, + "epoch": 0.6259844886671075, + "flos": 11911354836480.0, + "grad_norm": 1.588159424865464, + "language_loss": 0.75170666, + "learning_rate": 1.296396101663614e-06, + "loss": 0.7736634, + "num_input_tokens_seen": 112096205, + "step": 5206, + "time_per_iteration": 2.429076671600342 + }, + { + "auxiliary_loss_clip": 0.01164154, + "auxiliary_loss_mlp": 0.01026794, + "balance_loss_clip": 1.05111194, + "balance_loss_mlp": 1.01894987, + "epoch": 0.6261047315577466, + "flos": 15887958652800.0, + "grad_norm": 3.574489457448891, + "language_loss": 0.8394323, + "learning_rate": 1.2956669795885565e-06, + "loss": 0.86134183, + "num_input_tokens_seen": 112112835, + "step": 5207, + "time_per_iteration": 2.4251773357391357 + }, + { + "auxiliary_loss_clip": 0.01127823, + "auxiliary_loss_mlp": 0.01034217, + "balance_loss_clip": 1.04628062, + "balance_loss_mlp": 1.0260694, + "epoch": 0.6262249744483858, + "flos": 31248926916480.0, + "grad_norm": 2.1633989928415707, + "language_loss": 0.6804316, + "learning_rate": 1.294937964360259e-06, + "loss": 0.702052, + "num_input_tokens_seen": 112133105, + "step": 5208, + "time_per_iteration": 2.5997345447540283 + }, + { + "auxiliary_loss_clip": 0.01152985, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.04851007, + "balance_loss_mlp": 1.02177668, + "epoch": 0.6263452173390248, + "flos": 27198598435200.0, + "grad_norm": 2.202788255060346, + "language_loss": 0.71013522, + "learning_rate": 1.2942090560893108e-06, + "loss": 0.73197389, + "num_input_tokens_seen": 112152510, + "step": 5209, + "time_per_iteration": 2.5261213779449463 + }, + { + "auxiliary_loss_clip": 0.01173905, + "auxiliary_loss_mlp": 0.01024677, + "balance_loss_clip": 1.05083227, + "balance_loss_mlp": 1.01739907, + "epoch": 0.6264654602296639, + "flos": 37342069683840.0, + "grad_norm": 2.0449967530183635, + "language_loss": 0.60759819, + "learning_rate": 1.2934802548862882e-06, + "loss": 0.62958395, + "num_input_tokens_seen": 112175295, + "step": 5210, + "time_per_iteration": 2.5536608695983887 + }, + { + "auxiliary_loss_clip": 0.01142308, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.04583013, + "balance_loss_mlp": 1.02179551, + "epoch": 0.626585703120303, + "flos": 14756952136320.0, + "grad_norm": 1.7741220645081541, + "language_loss": 0.83058244, + "learning_rate": 1.292751560861749e-06, + "loss": 0.85229892, + "num_input_tokens_seen": 112190200, + "step": 5211, + "time_per_iteration": 2.4346988201141357 + }, + { + "auxiliary_loss_clip": 0.01175801, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.0510031, + "balance_loss_mlp": 1.02000999, + "epoch": 0.6267059460109421, + "flos": 22347318533760.0, + "grad_norm": 1.8700318560239715, + "language_loss": 0.79694223, + "learning_rate": 1.2920229741262354e-06, + "loss": 0.81898469, + "num_input_tokens_seen": 112208205, + "step": 5212, + "time_per_iteration": 2.4382312297821045 + }, + { + "auxiliary_loss_clip": 0.01147328, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.04774451, + "balance_loss_mlp": 1.01924598, + "epoch": 0.6268261889015811, + "flos": 17748813617280.0, + "grad_norm": 2.7344753534121433, + "language_loss": 0.75384033, + "learning_rate": 1.2912944947902739e-06, + "loss": 0.77558255, + "num_input_tokens_seen": 112224690, + "step": 5213, + "time_per_iteration": 2.4599359035491943 + }, + { + "auxiliary_loss_clip": 0.01152422, + "auxiliary_loss_mlp": 0.01024836, + "balance_loss_clip": 1.0487715, + "balance_loss_mlp": 1.01639569, + "epoch": 0.6269464317922203, + "flos": 32846484211200.0, + "grad_norm": 1.9549116769146608, + "language_loss": 0.71654594, + "learning_rate": 1.2905661229643742e-06, + "loss": 0.73831856, + "num_input_tokens_seen": 112244450, + "step": 5214, + "time_per_iteration": 2.630216360092163 + }, + { + "auxiliary_loss_clip": 0.01176062, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.05081916, + "balance_loss_mlp": 1.02159548, + "epoch": 0.6270666746828594, + "flos": 17929192740480.0, + "grad_norm": 2.2214229842556663, + "language_loss": 0.84309161, + "learning_rate": 1.2898378587590299e-06, + "loss": 0.86514866, + "num_input_tokens_seen": 112261050, + "step": 5215, + "time_per_iteration": 3.1176860332489014 + }, + { + "auxiliary_loss_clip": 0.0115838, + "auxiliary_loss_mlp": 0.0102373, + "balance_loss_clip": 1.0502367, + "balance_loss_mlp": 1.01613069, + "epoch": 0.6271869175734984, + "flos": 17457326749440.0, + "grad_norm": 1.8758552555966743, + "language_loss": 0.87420404, + "learning_rate": 1.2891097022847173e-06, + "loss": 0.89602506, + "num_input_tokens_seen": 112278395, + "step": 5216, + "time_per_iteration": 2.455867290496826 + }, + { + "auxiliary_loss_clip": 0.01146082, + "auxiliary_loss_mlp": 0.01033469, + "balance_loss_clip": 1.0476253, + "balance_loss_mlp": 1.0246172, + "epoch": 0.6273071604641376, + "flos": 26868615166080.0, + "grad_norm": 2.3621541623132916, + "language_loss": 0.66724181, + "learning_rate": 1.2883816536518978e-06, + "loss": 0.68903726, + "num_input_tokens_seen": 112299535, + "step": 5217, + "time_per_iteration": 2.5225913524627686 + }, + { + "auxiliary_loss_clip": 0.0115751, + "auxiliary_loss_mlp": 0.01026678, + "balance_loss_clip": 1.04755771, + "balance_loss_mlp": 1.01952553, + "epoch": 0.6274274033547766, + "flos": 26062384446720.0, + "grad_norm": 1.71003600763185, + "language_loss": 0.81872934, + "learning_rate": 1.2876537129710155e-06, + "loss": 0.84057128, + "num_input_tokens_seen": 112317265, + "step": 5218, + "time_per_iteration": 2.475831985473633 + }, + { + "auxiliary_loss_clip": 0.0114661, + "auxiliary_loss_mlp": 0.01031142, + "balance_loss_clip": 1.05319691, + "balance_loss_mlp": 1.02334619, + "epoch": 0.6275476462454157, + "flos": 20266259241600.0, + "grad_norm": 1.970132464259165, + "language_loss": 0.7524184, + "learning_rate": 1.286925880352499e-06, + "loss": 0.77419591, + "num_input_tokens_seen": 112336125, + "step": 5219, + "time_per_iteration": 2.473417043685913 + }, + { + "auxiliary_loss_clip": 0.01145305, + "auxiliary_loss_mlp": 0.01021292, + "balance_loss_clip": 1.04897714, + "balance_loss_mlp": 1.0139488, + "epoch": 0.6276678891360549, + "flos": 26320402817280.0, + "grad_norm": 1.7128010599842847, + "language_loss": 0.71219057, + "learning_rate": 1.2861981559067592e-06, + "loss": 0.7338565, + "num_input_tokens_seen": 112356730, + "step": 5220, + "time_per_iteration": 3.3524868488311768 + }, + { + "auxiliary_loss_clip": 0.0110738, + "auxiliary_loss_mlp": 0.01024401, + "balance_loss_clip": 1.04274035, + "balance_loss_mlp": 1.0168134, + "epoch": 0.6277881320266939, + "flos": 13912512324480.0, + "grad_norm": 2.0379167900975714, + "language_loss": 0.80072564, + "learning_rate": 1.2854705397441917e-06, + "loss": 0.82204348, + "num_input_tokens_seen": 112372270, + "step": 5221, + "time_per_iteration": 3.384284734725952 + }, + { + "auxiliary_loss_clip": 0.01125709, + "auxiliary_loss_mlp": 0.01026377, + "balance_loss_clip": 1.04305279, + "balance_loss_mlp": 1.01877129, + "epoch": 0.627908374917333, + "flos": 27048922462080.0, + "grad_norm": 2.2332600876185844, + "language_loss": 0.7759856, + "learning_rate": 1.2847430319751747e-06, + "loss": 0.79750645, + "num_input_tokens_seen": 112390365, + "step": 5222, + "time_per_iteration": 2.5690886974334717 + }, + { + "auxiliary_loss_clip": 0.01155425, + "auxiliary_loss_mlp": 0.01024738, + "balance_loss_clip": 1.05053329, + "balance_loss_mlp": 1.0174545, + "epoch": 0.6280286178079721, + "flos": 23769201386880.0, + "grad_norm": 2.288177421563343, + "language_loss": 0.67311609, + "learning_rate": 1.2840156327100712e-06, + "loss": 0.69491774, + "num_input_tokens_seen": 112407490, + "step": 5223, + "time_per_iteration": 3.172185182571411 + }, + { + "auxiliary_loss_clip": 0.01172709, + "auxiliary_loss_mlp": 0.01023958, + "balance_loss_clip": 1.05046487, + "balance_loss_mlp": 1.01622117, + "epoch": 0.6281488606986112, + "flos": 26359150613760.0, + "grad_norm": 1.8129402269948693, + "language_loss": 0.72260547, + "learning_rate": 1.2832883420592272e-06, + "loss": 0.7445721, + "num_input_tokens_seen": 112426385, + "step": 5224, + "time_per_iteration": 2.4578473567962646 + }, + { + "auxiliary_loss_clip": 0.01142726, + "auxiliary_loss_mlp": 0.01028501, + "balance_loss_clip": 1.0477531, + "balance_loss_mlp": 1.02057946, + "epoch": 0.6282691035892503, + "flos": 36137194848000.0, + "grad_norm": 3.5637560289815746, + "language_loss": 0.64467061, + "learning_rate": 1.282561160132972e-06, + "loss": 0.66638285, + "num_input_tokens_seen": 112446905, + "step": 5225, + "time_per_iteration": 2.6068241596221924 + }, + { + "auxiliary_loss_clip": 0.01151054, + "auxiliary_loss_mlp": 0.01037466, + "balance_loss_clip": 1.04585648, + "balance_loss_mlp": 1.02968717, + "epoch": 0.6283893464798894, + "flos": 26537231266560.0, + "grad_norm": 1.9507870436213395, + "language_loss": 0.80939525, + "learning_rate": 1.2818340870416186e-06, + "loss": 0.83128047, + "num_input_tokens_seen": 112468040, + "step": 5226, + "time_per_iteration": 2.5375113487243652 + }, + { + "auxiliary_loss_clip": 0.01138921, + "auxiliary_loss_mlp": 0.01029503, + "balance_loss_clip": 1.04562402, + "balance_loss_mlp": 1.02097321, + "epoch": 0.6285095893705285, + "flos": 22237216369920.0, + "grad_norm": 2.411895185398057, + "language_loss": 0.75849283, + "learning_rate": 1.2811071228954626e-06, + "loss": 0.780177, + "num_input_tokens_seen": 112486675, + "step": 5227, + "time_per_iteration": 2.5346691608428955 + }, + { + "auxiliary_loss_clip": 0.01146403, + "auxiliary_loss_mlp": 0.01026043, + "balance_loss_clip": 1.04913485, + "balance_loss_mlp": 1.01816332, + "epoch": 0.6286298322611675, + "flos": 26542259170560.0, + "grad_norm": 2.116746544027814, + "language_loss": 0.81267941, + "learning_rate": 1.2803802678047846e-06, + "loss": 0.83440387, + "num_input_tokens_seen": 112506825, + "step": 5228, + "time_per_iteration": 2.5236423015594482 + }, + { + "auxiliary_loss_clip": 0.0115161, + "auxiliary_loss_mlp": 0.01035118, + "balance_loss_clip": 1.04968143, + "balance_loss_mlp": 1.02688074, + "epoch": 0.6287500751518067, + "flos": 21795227516160.0, + "grad_norm": 1.9220122988730943, + "language_loss": 0.74154377, + "learning_rate": 1.279653521879848e-06, + "loss": 0.76341105, + "num_input_tokens_seen": 112526890, + "step": 5229, + "time_per_iteration": 2.489473819732666 + }, + { + "auxiliary_loss_clip": 0.01080833, + "auxiliary_loss_mlp": 0.01026129, + "balance_loss_clip": 1.03982747, + "balance_loss_mlp": 1.01892579, + "epoch": 0.6288703180424458, + "flos": 20009605587840.0, + "grad_norm": 1.9858378219753012, + "language_loss": 0.83831459, + "learning_rate": 1.2789268852308997e-06, + "loss": 0.85938418, + "num_input_tokens_seen": 112542100, + "step": 5230, + "time_per_iteration": 2.578523874282837 + }, + { + "auxiliary_loss_clip": 0.01152263, + "auxiliary_loss_mlp": 0.0103009, + "balance_loss_clip": 1.04742026, + "balance_loss_mlp": 1.02191186, + "epoch": 0.6289905609330848, + "flos": 22124923476480.0, + "grad_norm": 2.1981195811225214, + "language_loss": 0.71028095, + "learning_rate": 1.2782003579681688e-06, + "loss": 0.73210454, + "num_input_tokens_seen": 112561630, + "step": 5231, + "time_per_iteration": 2.470848798751831 + }, + { + "auxiliary_loss_clip": 0.01177175, + "auxiliary_loss_mlp": 0.01029849, + "balance_loss_clip": 1.05290663, + "balance_loss_mlp": 1.02168894, + "epoch": 0.629110803823724, + "flos": 25518481729920.0, + "grad_norm": 1.5388962269697406, + "language_loss": 0.74302125, + "learning_rate": 1.2774739402018701e-06, + "loss": 0.76509148, + "num_input_tokens_seen": 112582465, + "step": 5232, + "time_per_iteration": 2.4598615169525146 + }, + { + "auxiliary_loss_clip": 0.01162717, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.05331588, + "balance_loss_mlp": 1.02197623, + "epoch": 0.629231046714363, + "flos": 20886616056960.0, + "grad_norm": 2.1229375345960295, + "language_loss": 0.73529476, + "learning_rate": 1.2767476320422002e-06, + "loss": 0.75722539, + "num_input_tokens_seen": 112602390, + "step": 5233, + "time_per_iteration": 2.467738628387451 + }, + { + "auxiliary_loss_clip": 0.01048812, + "auxiliary_loss_mlp": 0.01001011, + "balance_loss_clip": 1.01820767, + "balance_loss_mlp": 1.00014102, + "epoch": 0.6293512896050021, + "flos": 65050027908480.0, + "grad_norm": 0.6788227917817856, + "language_loss": 0.57197392, + "learning_rate": 1.2760214335993392e-06, + "loss": 0.59247214, + "num_input_tokens_seen": 112669035, + "step": 5234, + "time_per_iteration": 3.1789321899414062 + }, + { + "auxiliary_loss_clip": 0.01152627, + "auxiliary_loss_mlp": 0.01025423, + "balance_loss_clip": 1.04704154, + "balance_loss_mlp": 1.01815128, + "epoch": 0.6294715324956413, + "flos": 34677857088000.0, + "grad_norm": 1.9001739370799604, + "language_loss": 0.59017414, + "learning_rate": 1.2752953449834514e-06, + "loss": 0.61195463, + "num_input_tokens_seen": 112691485, + "step": 5235, + "time_per_iteration": 2.579751968383789 + }, + { + "auxiliary_loss_clip": 0.01175512, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.05241036, + "balance_loss_mlp": 1.02036381, + "epoch": 0.6295917753862803, + "flos": 22784207656320.0, + "grad_norm": 1.5564747156146077, + "language_loss": 0.7999087, + "learning_rate": 1.2745693663046836e-06, + "loss": 0.82194149, + "num_input_tokens_seen": 112710555, + "step": 5236, + "time_per_iteration": 2.432197093963623 + }, + { + "auxiliary_loss_clip": 0.01154967, + "auxiliary_loss_mlp": 0.01023839, + "balance_loss_clip": 1.04776597, + "balance_loss_mlp": 1.01674283, + "epoch": 0.6297120182769194, + "flos": 20850454039680.0, + "grad_norm": 2.415094296415789, + "language_loss": 0.80901217, + "learning_rate": 1.2738434976731662e-06, + "loss": 0.83080018, + "num_input_tokens_seen": 112728740, + "step": 5237, + "time_per_iteration": 2.446061611175537 + }, + { + "auxiliary_loss_clip": 0.01147329, + "auxiliary_loss_mlp": 0.01036582, + "balance_loss_clip": 1.04914629, + "balance_loss_mlp": 1.028023, + "epoch": 0.6298322611675584, + "flos": 19497662997120.0, + "grad_norm": 1.511658108858409, + "language_loss": 0.75341427, + "learning_rate": 1.2731177391990125e-06, + "loss": 0.77525342, + "num_input_tokens_seen": 112748665, + "step": 5238, + "time_per_iteration": 2.495856285095215 + }, + { + "auxiliary_loss_clip": 0.01146229, + "auxiliary_loss_mlp": 0.01024142, + "balance_loss_clip": 1.04514074, + "balance_loss_mlp": 1.01671517, + "epoch": 0.6299525040581976, + "flos": 12604466649600.0, + "grad_norm": 1.9377019127844985, + "language_loss": 0.81650442, + "learning_rate": 1.2723920909923203e-06, + "loss": 0.83820814, + "num_input_tokens_seen": 112764410, + "step": 5239, + "time_per_iteration": 2.4648613929748535 + }, + { + "auxiliary_loss_clip": 0.01074429, + "auxiliary_loss_mlp": 0.01004566, + "balance_loss_clip": 1.0169102, + "balance_loss_mlp": 1.00367188, + "epoch": 0.6300727469488366, + "flos": 57725685636480.0, + "grad_norm": 0.8612994398266367, + "language_loss": 0.60413402, + "learning_rate": 1.2716665531631688e-06, + "loss": 0.62492394, + "num_input_tokens_seen": 112818695, + "step": 5240, + "time_per_iteration": 2.967538595199585 + }, + { + "auxiliary_loss_clip": 0.01164447, + "auxiliary_loss_mlp": 0.01023709, + "balance_loss_clip": 1.04732883, + "balance_loss_mlp": 1.01520991, + "epoch": 0.6301929898394757, + "flos": 22527302607360.0, + "grad_norm": 1.7266647771960117, + "language_loss": 0.77568197, + "learning_rate": 1.270941125821623e-06, + "loss": 0.79756355, + "num_input_tokens_seen": 112839120, + "step": 5241, + "time_per_iteration": 2.4679481983184814 + }, + { + "auxiliary_loss_clip": 0.01152948, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.0445534, + "balance_loss_mlp": 1.02295089, + "epoch": 0.6303132327301149, + "flos": 28293550675200.0, + "grad_norm": 1.7122368213958923, + "language_loss": 0.75129187, + "learning_rate": 1.2702158090777278e-06, + "loss": 0.77312946, + "num_input_tokens_seen": 112860210, + "step": 5242, + "time_per_iteration": 3.3576436042785645 + }, + { + "auxiliary_loss_clip": 0.01127615, + "auxiliary_loss_mlp": 0.01025928, + "balance_loss_clip": 1.0440861, + "balance_loss_mlp": 1.01821852, + "epoch": 0.6304334756207539, + "flos": 25264521596160.0, + "grad_norm": 1.906634468510918, + "language_loss": 0.74925578, + "learning_rate": 1.2694906030415148e-06, + "loss": 0.77079117, + "num_input_tokens_seen": 112877955, + "step": 5243, + "time_per_iteration": 2.5397820472717285 + }, + { + "auxiliary_loss_clip": 0.01154239, + "auxiliary_loss_mlp": 0.01028966, + "balance_loss_clip": 1.04855275, + "balance_loss_mlp": 1.02098536, + "epoch": 0.630553718511393, + "flos": 18033548728320.0, + "grad_norm": 2.7027013290048454, + "language_loss": 0.82178998, + "learning_rate": 1.2687655078229958e-06, + "loss": 0.84362197, + "num_input_tokens_seen": 112892285, + "step": 5244, + "time_per_iteration": 2.6105520725250244 + }, + { + "auxiliary_loss_clip": 0.01143502, + "auxiliary_loss_mlp": 0.01025547, + "balance_loss_clip": 1.04837537, + "balance_loss_mlp": 1.01791167, + "epoch": 0.6306739614020321, + "flos": 27304103658240.0, + "grad_norm": 2.2809779005503863, + "language_loss": 0.68951988, + "learning_rate": 1.2680405235321678e-06, + "loss": 0.71121031, + "num_input_tokens_seen": 112913620, + "step": 5245, + "time_per_iteration": 2.5363271236419678 + }, + { + "auxiliary_loss_clip": 0.01148288, + "auxiliary_loss_mlp": 0.00762787, + "balance_loss_clip": 1.05169559, + "balance_loss_mlp": 1.0003823, + "epoch": 0.6307942042926712, + "flos": 15341434243200.0, + "grad_norm": 2.0723166847212067, + "language_loss": 0.78863728, + "learning_rate": 1.267315650279011e-06, + "loss": 0.80774796, + "num_input_tokens_seen": 112932090, + "step": 5246, + "time_per_iteration": 3.3224740028381348 + }, + { + "auxiliary_loss_clip": 0.01126687, + "auxiliary_loss_mlp": 0.01024207, + "balance_loss_clip": 1.04773426, + "balance_loss_mlp": 1.01682222, + "epoch": 0.6309144471833102, + "flos": 19606400444160.0, + "grad_norm": 1.7387648797215294, + "language_loss": 0.73967999, + "learning_rate": 1.2665908881734874e-06, + "loss": 0.76118898, + "num_input_tokens_seen": 112950925, + "step": 5247, + "time_per_iteration": 3.412727117538452 + }, + { + "auxiliary_loss_clip": 0.01159651, + "auxiliary_loss_mlp": 0.01025458, + "balance_loss_clip": 1.04971337, + "balance_loss_mlp": 1.01848137, + "epoch": 0.6310346900739494, + "flos": 17493345112320.0, + "grad_norm": 2.1975669661306965, + "language_loss": 0.84660661, + "learning_rate": 1.2658662373255432e-06, + "loss": 0.86845767, + "num_input_tokens_seen": 112969315, + "step": 5248, + "time_per_iteration": 2.5256261825561523 + }, + { + "auxiliary_loss_clip": 0.01053738, + "auxiliary_loss_mlp": 0.01002163, + "balance_loss_clip": 1.01614547, + "balance_loss_mlp": 1.00128686, + "epoch": 0.6311549329645885, + "flos": 55070164131840.0, + "grad_norm": 0.7103915134079878, + "language_loss": 0.52294707, + "learning_rate": 1.2651416978451063e-06, + "loss": 0.54350609, + "num_input_tokens_seen": 113034700, + "step": 5249, + "time_per_iteration": 3.1693356037139893 + }, + { + "auxiliary_loss_clip": 0.01175721, + "auxiliary_loss_mlp": 0.01024664, + "balance_loss_clip": 1.05043006, + "balance_loss_mlp": 1.01685023, + "epoch": 0.6312751758552275, + "flos": 41902545075840.0, + "grad_norm": 2.2658316028768093, + "language_loss": 0.65159422, + "learning_rate": 1.2644172698420903e-06, + "loss": 0.67359805, + "num_input_tokens_seen": 113056805, + "step": 5250, + "time_per_iteration": 3.312739372253418 + }, + { + "auxiliary_loss_clip": 0.01131324, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.04585195, + "balance_loss_mlp": 1.02242541, + "epoch": 0.6313954187458667, + "flos": 19646800266240.0, + "grad_norm": 1.796287760389685, + "language_loss": 0.84842646, + "learning_rate": 1.2636929534263892e-06, + "loss": 0.87004495, + "num_input_tokens_seen": 113075790, + "step": 5251, + "time_per_iteration": 2.5325424671173096 + }, + { + "auxiliary_loss_clip": 0.01131442, + "auxiliary_loss_mlp": 0.01023171, + "balance_loss_clip": 1.04186857, + "balance_loss_mlp": 1.01551795, + "epoch": 0.6315156616365057, + "flos": 22894273906560.0, + "grad_norm": 1.7231176928805898, + "language_loss": 0.77580416, + "learning_rate": 1.2629687487078821e-06, + "loss": 0.79735029, + "num_input_tokens_seen": 113094600, + "step": 5252, + "time_per_iteration": 2.551456928253174 + }, + { + "auxiliary_loss_clip": 0.01162681, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.04751658, + "balance_loss_mlp": 1.02010226, + "epoch": 0.6316359045271448, + "flos": 23726251699200.0, + "grad_norm": 2.164319552760073, + "language_loss": 0.76402354, + "learning_rate": 1.2622446557964293e-06, + "loss": 0.78593576, + "num_input_tokens_seen": 113112605, + "step": 5253, + "time_per_iteration": 2.5192368030548096 + }, + { + "auxiliary_loss_clip": 0.01143661, + "auxiliary_loss_mlp": 0.01027515, + "balance_loss_clip": 1.04325616, + "balance_loss_mlp": 1.02047586, + "epoch": 0.631756147417784, + "flos": 33108417164160.0, + "grad_norm": 1.6950993714776457, + "language_loss": 0.71356583, + "learning_rate": 1.261520674801876e-06, + "loss": 0.73527753, + "num_input_tokens_seen": 113133200, + "step": 5254, + "time_per_iteration": 2.6268553733825684 + }, + { + "auxiliary_loss_clip": 0.01146707, + "auxiliary_loss_mlp": 0.01026586, + "balance_loss_clip": 1.05171251, + "balance_loss_mlp": 1.01870036, + "epoch": 0.631876390308423, + "flos": 31248424126080.0, + "grad_norm": 1.892572863066902, + "language_loss": 0.72500235, + "learning_rate": 1.2607968058340488e-06, + "loss": 0.74673533, + "num_input_tokens_seen": 113152895, + "step": 5255, + "time_per_iteration": 2.5809309482574463 + }, + { + "auxiliary_loss_clip": 0.01140263, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.04521251, + "balance_loss_mlp": 1.0217886, + "epoch": 0.6319966331990621, + "flos": 24681152810880.0, + "grad_norm": 1.7664565370931442, + "language_loss": 0.73207307, + "learning_rate": 1.2600730490027583e-06, + "loss": 0.75377136, + "num_input_tokens_seen": 113173135, + "step": 5256, + "time_per_iteration": 2.557288408279419 + }, + { + "auxiliary_loss_clip": 0.01131318, + "auxiliary_loss_mlp": 0.01027553, + "balance_loss_clip": 1.04668927, + "balance_loss_mlp": 1.0197866, + "epoch": 0.6321168760897012, + "flos": 17491764913920.0, + "grad_norm": 1.584690559047317, + "language_loss": 0.80507481, + "learning_rate": 1.2593494044177984e-06, + "loss": 0.82666349, + "num_input_tokens_seen": 113191440, + "step": 5257, + "time_per_iteration": 2.5556273460388184 + }, + { + "auxiliary_loss_clip": 0.011768, + "auxiliary_loss_mlp": 0.01026408, + "balance_loss_clip": 1.04890287, + "balance_loss_mlp": 1.01830173, + "epoch": 0.6322371189803403, + "flos": 18295373940480.0, + "grad_norm": 2.251341422288516, + "language_loss": 0.80602717, + "learning_rate": 1.2586258721889448e-06, + "loss": 0.8280592, + "num_input_tokens_seen": 113208790, + "step": 5258, + "time_per_iteration": 2.4100778102874756 + }, + { + "auxiliary_loss_clip": 0.01111202, + "auxiliary_loss_mlp": 0.0102647, + "balance_loss_clip": 1.0454185, + "balance_loss_mlp": 1.01876307, + "epoch": 0.6323573618709794, + "flos": 20157270399360.0, + "grad_norm": 1.9190900136527766, + "language_loss": 0.81956846, + "learning_rate": 1.2579024524259573e-06, + "loss": 0.84094512, + "num_input_tokens_seen": 113225050, + "step": 5259, + "time_per_iteration": 2.5287811756134033 + }, + { + "auxiliary_loss_clip": 0.0113967, + "auxiliary_loss_mlp": 0.01025239, + "balance_loss_clip": 1.04355669, + "balance_loss_mlp": 1.01740742, + "epoch": 0.6324776047616185, + "flos": 20042391726720.0, + "grad_norm": 2.081989733055051, + "language_loss": 0.91321421, + "learning_rate": 1.2571791452385768e-06, + "loss": 0.93486321, + "num_input_tokens_seen": 113242315, + "step": 5260, + "time_per_iteration": 2.480090618133545 + }, + { + "auxiliary_loss_clip": 0.01146337, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.04780018, + "balance_loss_mlp": 1.02333784, + "epoch": 0.6325978476522576, + "flos": 30848235724800.0, + "grad_norm": 1.567618844513022, + "language_loss": 0.77061307, + "learning_rate": 1.2564559507365301e-06, + "loss": 0.79238868, + "num_input_tokens_seen": 113264720, + "step": 5261, + "time_per_iteration": 2.5631837844848633 + }, + { + "auxiliary_loss_clip": 0.01145494, + "auxiliary_loss_mlp": 0.01025971, + "balance_loss_clip": 1.04648447, + "balance_loss_mlp": 1.01751292, + "epoch": 0.6327180905428966, + "flos": 24535104111360.0, + "grad_norm": 2.611555615167692, + "language_loss": 0.78617263, + "learning_rate": 1.2557328690295244e-06, + "loss": 0.80788732, + "num_input_tokens_seen": 113282910, + "step": 5262, + "time_per_iteration": 2.5441393852233887 + }, + { + "auxiliary_loss_clip": 0.01137522, + "auxiliary_loss_mlp": 0.01026415, + "balance_loss_clip": 1.04886937, + "balance_loss_mlp": 1.01872611, + "epoch": 0.6328383334335358, + "flos": 21575274583680.0, + "grad_norm": 1.863550005022295, + "language_loss": 0.76022011, + "learning_rate": 1.255009900227251e-06, + "loss": 0.78185952, + "num_input_tokens_seen": 113301935, + "step": 5263, + "time_per_iteration": 2.5330796241760254 + }, + { + "auxiliary_loss_clip": 0.01170232, + "auxiliary_loss_mlp": 0.01024154, + "balance_loss_clip": 1.05040181, + "balance_loss_mlp": 1.01716805, + "epoch": 0.6329585763241748, + "flos": 22929861306240.0, + "grad_norm": 1.7881547881459787, + "language_loss": 0.79132324, + "learning_rate": 1.254287044439383e-06, + "loss": 0.81326705, + "num_input_tokens_seen": 113321540, + "step": 5264, + "time_per_iteration": 2.437756299972534 + }, + { + "auxiliary_loss_clip": 0.0107299, + "auxiliary_loss_mlp": 0.01001186, + "balance_loss_clip": 1.01553082, + "balance_loss_mlp": 1.00032794, + "epoch": 0.6330788192148139, + "flos": 70936897847040.0, + "grad_norm": 0.779024323654976, + "language_loss": 0.54449677, + "learning_rate": 1.2535643017755776e-06, + "loss": 0.56523854, + "num_input_tokens_seen": 113383730, + "step": 5265, + "time_per_iteration": 3.1355056762695312 + }, + { + "auxiliary_loss_clip": 0.01130683, + "auxiliary_loss_mlp": 0.01032311, + "balance_loss_clip": 1.04431105, + "balance_loss_mlp": 1.02402544, + "epoch": 0.6331990621054531, + "flos": 21244501215360.0, + "grad_norm": 2.595743669949413, + "language_loss": 0.72268641, + "learning_rate": 1.2528416723454737e-06, + "loss": 0.74431634, + "num_input_tokens_seen": 113400400, + "step": 5266, + "time_per_iteration": 2.511258602142334 + }, + { + "auxiliary_loss_clip": 0.01171116, + "auxiliary_loss_mlp": 0.01023696, + "balance_loss_clip": 1.05074954, + "balance_loss_mlp": 1.01669884, + "epoch": 0.6333193049960921, + "flos": 34459412526720.0, + "grad_norm": 1.4988909305434233, + "language_loss": 0.70961404, + "learning_rate": 1.2521191562586945e-06, + "loss": 0.73156214, + "num_input_tokens_seen": 113424050, + "step": 5267, + "time_per_iteration": 2.527148485183716 + }, + { + "auxiliary_loss_clip": 0.01174584, + "auxiliary_loss_mlp": 0.007623, + "balance_loss_clip": 1.05160499, + "balance_loss_mlp": 1.00047827, + "epoch": 0.6334395478867312, + "flos": 18329883932160.0, + "grad_norm": 2.3017376266098637, + "language_loss": 0.76833642, + "learning_rate": 1.2513967536248445e-06, + "loss": 0.78770524, + "num_input_tokens_seen": 113440370, + "step": 5268, + "time_per_iteration": 2.4046616554260254 + }, + { + "auxiliary_loss_clip": 0.01154731, + "auxiliary_loss_mlp": 0.01028181, + "balance_loss_clip": 1.04934096, + "balance_loss_mlp": 1.02093577, + "epoch": 0.6335597907773702, + "flos": 23623152687360.0, + "grad_norm": 1.5746507331262924, + "language_loss": 0.80809134, + "learning_rate": 1.2506744645535117e-06, + "loss": 0.82992047, + "num_input_tokens_seen": 113460800, + "step": 5269, + "time_per_iteration": 3.2198593616485596 + }, + { + "auxiliary_loss_clip": 0.01136489, + "auxiliary_loss_mlp": 0.01021628, + "balance_loss_clip": 1.04161441, + "balance_loss_mlp": 1.01389718, + "epoch": 0.6336800336680094, + "flos": 22710913954560.0, + "grad_norm": 7.694002005438443, + "language_loss": 0.604617, + "learning_rate": 1.249952289154267e-06, + "loss": 0.62619817, + "num_input_tokens_seen": 113480840, + "step": 5270, + "time_per_iteration": 2.5100300312042236 + }, + { + "auxiliary_loss_clip": 0.01090882, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.04044759, + "balance_loss_mlp": 1.0215013, + "epoch": 0.6338002765586485, + "flos": 23622757637760.0, + "grad_norm": 2.075392247291788, + "language_loss": 0.76413989, + "learning_rate": 1.2492302275366635e-06, + "loss": 0.78533655, + "num_input_tokens_seen": 113500515, + "step": 5271, + "time_per_iteration": 2.5960912704467773 + }, + { + "auxiliary_loss_clip": 0.01152743, + "auxiliary_loss_mlp": 0.01027776, + "balance_loss_clip": 1.04660082, + "balance_loss_mlp": 1.02003384, + "epoch": 0.6339205194492875, + "flos": 26505450708480.0, + "grad_norm": 2.546027301746887, + "language_loss": 0.65856743, + "learning_rate": 1.2485082798102377e-06, + "loss": 0.6803726, + "num_input_tokens_seen": 113520930, + "step": 5272, + "time_per_iteration": 2.497006416320801 + }, + { + "auxiliary_loss_clip": 0.01135857, + "auxiliary_loss_mlp": 0.01024821, + "balance_loss_clip": 1.04543376, + "balance_loss_mlp": 1.01626766, + "epoch": 0.6340407623399267, + "flos": 18544306170240.0, + "grad_norm": 2.237009596368795, + "language_loss": 0.68699163, + "learning_rate": 1.2477864460845084e-06, + "loss": 0.70859838, + "num_input_tokens_seen": 113537330, + "step": 5273, + "time_per_iteration": 3.34025239944458 + }, + { + "auxiliary_loss_clip": 0.01143964, + "auxiliary_loss_mlp": 0.0102555, + "balance_loss_clip": 1.0454905, + "balance_loss_mlp": 1.01764035, + "epoch": 0.6341610052305657, + "flos": 17712579772800.0, + "grad_norm": 2.7675679732899767, + "language_loss": 0.7315557, + "learning_rate": 1.2470647264689776e-06, + "loss": 0.75325084, + "num_input_tokens_seen": 113555810, + "step": 5274, + "time_per_iteration": 3.3289694786071777 + }, + { + "auxiliary_loss_clip": 0.01110216, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.04107213, + "balance_loss_mlp": 1.01845288, + "epoch": 0.6342812481212048, + "flos": 23587026583680.0, + "grad_norm": 15.072758610497422, + "language_loss": 0.70952404, + "learning_rate": 1.2463431210731282e-06, + "loss": 0.73089075, + "num_input_tokens_seen": 113575395, + "step": 5275, + "time_per_iteration": 2.6223363876342773 + }, + { + "auxiliary_loss_clip": 0.01125579, + "auxiliary_loss_mlp": 0.01022951, + "balance_loss_clip": 1.04233372, + "balance_loss_mlp": 1.01567364, + "epoch": 0.634401491011844, + "flos": 17821927751040.0, + "grad_norm": 2.4653105526294388, + "language_loss": 0.76252979, + "learning_rate": 1.2456216300064289e-06, + "loss": 0.78401506, + "num_input_tokens_seen": 113592945, + "step": 5276, + "time_per_iteration": 3.3060522079467773 + }, + { + "auxiliary_loss_clip": 0.01128194, + "auxiliary_loss_mlp": 0.01025351, + "balance_loss_clip": 1.04329205, + "balance_loss_mlp": 1.01741791, + "epoch": 0.634521733902483, + "flos": 21358158825600.0, + "grad_norm": 1.6074230619354897, + "language_loss": 0.78452903, + "learning_rate": 1.244900253378328e-06, + "loss": 0.80606449, + "num_input_tokens_seen": 113613000, + "step": 5277, + "time_per_iteration": 2.549111843109131 + }, + { + "auxiliary_loss_clip": 0.01086331, + "auxiliary_loss_mlp": 0.0102657, + "balance_loss_clip": 1.04237294, + "balance_loss_mlp": 1.01922965, + "epoch": 0.6346419767931221, + "flos": 16545052103040.0, + "grad_norm": 1.8858344114320909, + "language_loss": 0.69168562, + "learning_rate": 1.2441789912982583e-06, + "loss": 0.71281463, + "num_input_tokens_seen": 113630085, + "step": 5278, + "time_per_iteration": 2.5837655067443848 + }, + { + "auxiliary_loss_clip": 0.01162283, + "auxiliary_loss_mlp": 0.01022699, + "balance_loss_clip": 1.04936278, + "balance_loss_mlp": 1.0149262, + "epoch": 0.6347622196837612, + "flos": 24350989973760.0, + "grad_norm": 1.8158614743689814, + "language_loss": 0.64959836, + "learning_rate": 1.2434578438756346e-06, + "loss": 0.67144823, + "num_input_tokens_seen": 113650515, + "step": 5279, + "time_per_iteration": 2.507667064666748 + }, + { + "auxiliary_loss_clip": 0.01157929, + "auxiliary_loss_mlp": 0.01022055, + "balance_loss_clip": 1.04682398, + "balance_loss_mlp": 1.01469946, + "epoch": 0.6348824625744003, + "flos": 64523178195840.0, + "grad_norm": 1.9204991286030741, + "language_loss": 0.77853382, + "learning_rate": 1.242736811219855e-06, + "loss": 0.80033362, + "num_input_tokens_seen": 113676475, + "step": 5280, + "time_per_iteration": 2.8520243167877197 + }, + { + "auxiliary_loss_clip": 0.01154301, + "auxiliary_loss_mlp": 0.01024287, + "balance_loss_clip": 1.04774368, + "balance_loss_mlp": 1.01656866, + "epoch": 0.6350027054650393, + "flos": 28622133313920.0, + "grad_norm": 2.0580047871682092, + "language_loss": 0.81554079, + "learning_rate": 1.2420158934402988e-06, + "loss": 0.83732677, + "num_input_tokens_seen": 113697090, + "step": 5281, + "time_per_iteration": 2.518745183944702 + }, + { + "auxiliary_loss_clip": 0.011162, + "auxiliary_loss_mlp": 0.01024988, + "balance_loss_clip": 1.04131126, + "balance_loss_mlp": 1.01732886, + "epoch": 0.6351229483556785, + "flos": 23002544476800.0, + "grad_norm": 1.8350585422268777, + "language_loss": 0.84928542, + "learning_rate": 1.2412950906463286e-06, + "loss": 0.87069726, + "num_input_tokens_seen": 113714395, + "step": 5282, + "time_per_iteration": 2.5169124603271484 + }, + { + "auxiliary_loss_clip": 0.01115645, + "auxiliary_loss_mlp": 0.01026344, + "balance_loss_clip": 1.0443542, + "balance_loss_mlp": 1.01885462, + "epoch": 0.6352431912463176, + "flos": 21939300967680.0, + "grad_norm": 1.7217629415082754, + "language_loss": 0.89771909, + "learning_rate": 1.2405744029472902e-06, + "loss": 0.91913891, + "num_input_tokens_seen": 113733880, + "step": 5283, + "time_per_iteration": 2.570225715637207 + }, + { + "auxiliary_loss_clip": 0.01141719, + "auxiliary_loss_mlp": 0.01024448, + "balance_loss_clip": 1.04499722, + "balance_loss_mlp": 1.01726615, + "epoch": 0.6353634341369566, + "flos": 13735257684480.0, + "grad_norm": 2.3437238379443532, + "language_loss": 0.76385826, + "learning_rate": 1.2398538304525108e-06, + "loss": 0.78551996, + "num_input_tokens_seen": 113752505, + "step": 5284, + "time_per_iteration": 2.468639612197876 + }, + { + "auxiliary_loss_clip": 0.01117207, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.04538131, + "balance_loss_mlp": 1.0256952, + "epoch": 0.6354836770275958, + "flos": 19316170552320.0, + "grad_norm": 1.973459835833161, + "language_loss": 0.75322002, + "learning_rate": 1.2391333732713016e-06, + "loss": 0.77473271, + "num_input_tokens_seen": 113770310, + "step": 5285, + "time_per_iteration": 2.5357022285461426 + }, + { + "auxiliary_loss_clip": 0.01121697, + "auxiliary_loss_mlp": 0.01037953, + "balance_loss_clip": 1.04391575, + "balance_loss_mlp": 1.02874374, + "epoch": 0.6356039199182348, + "flos": 21613375935360.0, + "grad_norm": 2.091643414321461, + "language_loss": 0.78670728, + "learning_rate": 1.2384130315129543e-06, + "loss": 0.80830371, + "num_input_tokens_seen": 113788635, + "step": 5286, + "time_per_iteration": 2.5537593364715576 + }, + { + "auxiliary_loss_clip": 0.01091519, + "auxiliary_loss_mlp": 0.0102581, + "balance_loss_clip": 1.04289269, + "balance_loss_mlp": 1.01763833, + "epoch": 0.6357241628088739, + "flos": 18111978074880.0, + "grad_norm": 2.5820126966692167, + "language_loss": 0.73408329, + "learning_rate": 1.2376928052867447e-06, + "loss": 0.75525659, + "num_input_tokens_seen": 113807755, + "step": 5287, + "time_per_iteration": 2.60221266746521 + }, + { + "auxiliary_loss_clip": 0.01145728, + "auxiliary_loss_mlp": 0.01026239, + "balance_loss_clip": 1.04870617, + "balance_loss_mlp": 1.01874113, + "epoch": 0.6358444056995131, + "flos": 24935256599040.0, + "grad_norm": 1.900953781513786, + "language_loss": 0.77475965, + "learning_rate": 1.2369726947019299e-06, + "loss": 0.79647923, + "num_input_tokens_seen": 113828230, + "step": 5288, + "time_per_iteration": 2.5208442211151123 + }, + { + "auxiliary_loss_clip": 0.01156505, + "auxiliary_loss_mlp": 0.01020329, + "balance_loss_clip": 1.04632115, + "balance_loss_mlp": 1.01276457, + "epoch": 0.6359646485901521, + "flos": 23293348986240.0, + "grad_norm": 2.162948496529071, + "language_loss": 0.67297655, + "learning_rate": 1.2362526998677511e-06, + "loss": 0.69474483, + "num_input_tokens_seen": 113844595, + "step": 5289, + "time_per_iteration": 2.457869529724121 + }, + { + "auxiliary_loss_clip": 0.01148012, + "auxiliary_loss_mlp": 0.01027002, + "balance_loss_clip": 1.04748273, + "balance_loss_mlp": 1.01988482, + "epoch": 0.6360848914807912, + "flos": 20887442069760.0, + "grad_norm": 1.8536267063473426, + "language_loss": 0.84292012, + "learning_rate": 1.2355328208934301e-06, + "loss": 0.86467028, + "num_input_tokens_seen": 113863470, + "step": 5290, + "time_per_iteration": 2.492821216583252 + }, + { + "auxiliary_loss_clip": 0.01156999, + "auxiliary_loss_mlp": 0.00762547, + "balance_loss_clip": 1.04509509, + "balance_loss_mlp": 1.00057101, + "epoch": 0.6362051343714303, + "flos": 18479775386880.0, + "grad_norm": 1.6114396935860047, + "language_loss": 0.7220962, + "learning_rate": 1.2348130578881728e-06, + "loss": 0.74129164, + "num_input_tokens_seen": 113881690, + "step": 5291, + "time_per_iteration": 2.4520437717437744 + }, + { + "auxiliary_loss_clip": 0.01174524, + "auxiliary_loss_mlp": 0.0102957, + "balance_loss_clip": 1.04988742, + "balance_loss_mlp": 1.02157152, + "epoch": 0.6363253772620694, + "flos": 24389594115840.0, + "grad_norm": 2.519250035934543, + "language_loss": 0.76353866, + "learning_rate": 1.2340934109611664e-06, + "loss": 0.78557956, + "num_input_tokens_seen": 113902450, + "step": 5292, + "time_per_iteration": 2.452216148376465 + }, + { + "auxiliary_loss_clip": 0.01150948, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.04919553, + "balance_loss_mlp": 1.01784182, + "epoch": 0.6364456201527084, + "flos": 25958243940480.0, + "grad_norm": 1.9950303360599584, + "language_loss": 0.68935609, + "learning_rate": 1.2333738802215798e-06, + "loss": 0.71112996, + "num_input_tokens_seen": 113922670, + "step": 5293, + "time_per_iteration": 2.5406911373138428 + }, + { + "auxiliary_loss_clip": 0.01108759, + "auxiliary_loss_mlp": 0.01025938, + "balance_loss_clip": 1.04078245, + "balance_loss_mlp": 1.01828742, + "epoch": 0.6365658630433476, + "flos": 20740711011840.0, + "grad_norm": 1.8296366015077776, + "language_loss": 0.80890852, + "learning_rate": 1.2326544657785668e-06, + "loss": 0.83025551, + "num_input_tokens_seen": 113942360, + "step": 5294, + "time_per_iteration": 2.544131278991699 + }, + { + "auxiliary_loss_clip": 0.01122527, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.04422414, + "balance_loss_mlp": 1.02450192, + "epoch": 0.6366861059339867, + "flos": 21434146047360.0, + "grad_norm": 2.83681433898636, + "language_loss": 0.74706316, + "learning_rate": 1.2319351677412608e-06, + "loss": 0.76861858, + "num_input_tokens_seen": 113959405, + "step": 5295, + "time_per_iteration": 3.3298044204711914 + }, + { + "auxiliary_loss_clip": 0.01140489, + "auxiliary_loss_mlp": 0.01026381, + "balance_loss_clip": 1.04899931, + "balance_loss_mlp": 1.01842988, + "epoch": 0.6368063488246257, + "flos": 22267093507200.0, + "grad_norm": 2.28397664650324, + "language_loss": 0.74407566, + "learning_rate": 1.2312159862187796e-06, + "loss": 0.76574433, + "num_input_tokens_seen": 113977815, + "step": 5296, + "time_per_iteration": 2.5113227367401123 + }, + { + "auxiliary_loss_clip": 0.01177998, + "auxiliary_loss_mlp": 0.01034236, + "balance_loss_clip": 1.05291986, + "balance_loss_mlp": 1.02645445, + "epoch": 0.6369265917152649, + "flos": 22420719976320.0, + "grad_norm": 2.688298999957881, + "language_loss": 0.75958943, + "learning_rate": 1.2304969213202217e-06, + "loss": 0.78171176, + "num_input_tokens_seen": 113999075, + "step": 5297, + "time_per_iteration": 2.453690767288208 + }, + { + "auxiliary_loss_clip": 0.01139792, + "auxiliary_loss_mlp": 0.01024813, + "balance_loss_clip": 1.0453496, + "balance_loss_mlp": 1.01769006, + "epoch": 0.6370468346059039, + "flos": 24718176754560.0, + "grad_norm": 2.2552513326488963, + "language_loss": 0.79033393, + "learning_rate": 1.2297779731546692e-06, + "loss": 0.81198001, + "num_input_tokens_seen": 114018170, + "step": 5298, + "time_per_iteration": 2.5117056369781494 + }, + { + "auxiliary_loss_clip": 0.01143514, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.04846466, + "balance_loss_mlp": 1.02180135, + "epoch": 0.637167077496543, + "flos": 25296589463040.0, + "grad_norm": 1.9199288917648691, + "language_loss": 0.78064179, + "learning_rate": 1.2290591418311853e-06, + "loss": 0.80237395, + "num_input_tokens_seen": 114035565, + "step": 5299, + "time_per_iteration": 3.3526906967163086 + }, + { + "auxiliary_loss_clip": 0.01161287, + "auxiliary_loss_mlp": 0.01027273, + "balance_loss_clip": 1.05219555, + "balance_loss_mlp": 1.01969123, + "epoch": 0.637287320387182, + "flos": 27671110871040.0, + "grad_norm": 1.6197946487048385, + "language_loss": 0.72045326, + "learning_rate": 1.2283404274588172e-06, + "loss": 0.7423389, + "num_input_tokens_seen": 114054510, + "step": 5300, + "time_per_iteration": 3.367604970932007 + }, + { + "auxiliary_loss_clip": 0.00996388, + "auxiliary_loss_mlp": 0.01004192, + "balance_loss_clip": 1.01321888, + "balance_loss_mlp": 1.00309527, + "epoch": 0.6374075632778212, + "flos": 63173406873600.0, + "grad_norm": 0.7427944534174411, + "language_loss": 0.52826333, + "learning_rate": 1.227621830146592e-06, + "loss": 0.54826909, + "num_input_tokens_seen": 114109875, + "step": 5301, + "time_per_iteration": 3.1422998905181885 + }, + { + "auxiliary_loss_clip": 0.01139577, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.05139172, + "balance_loss_mlp": 1.02269101, + "epoch": 0.6375278061684603, + "flos": 25558127366400.0, + "grad_norm": 1.9973624382908963, + "language_loss": 0.79488832, + "learning_rate": 1.2269033500035217e-06, + "loss": 0.81658697, + "num_input_tokens_seen": 114130010, + "step": 5302, + "time_per_iteration": 2.84183406829834 + }, + { + "auxiliary_loss_clip": 0.011305, + "auxiliary_loss_mlp": 0.01030061, + "balance_loss_clip": 1.04673529, + "balance_loss_mlp": 1.02270937, + "epoch": 0.6376480490590993, + "flos": 25666362023040.0, + "grad_norm": 1.6599732459490695, + "language_loss": 0.73446679, + "learning_rate": 1.2261849871385988e-06, + "loss": 0.7560724, + "num_input_tokens_seen": 114151115, + "step": 5303, + "time_per_iteration": 3.293503522872925 + }, + { + "auxiliary_loss_clip": 0.01174423, + "auxiliary_loss_mlp": 0.01023777, + "balance_loss_clip": 1.04949498, + "balance_loss_mlp": 1.01572454, + "epoch": 0.6377682919497385, + "flos": 31537684350720.0, + "grad_norm": 2.2285008761150786, + "language_loss": 0.6242066, + "learning_rate": 1.2254667416607972e-06, + "loss": 0.64618862, + "num_input_tokens_seen": 114172715, + "step": 5304, + "time_per_iteration": 2.506847858428955 + }, + { + "auxiliary_loss_clip": 0.01157954, + "auxiliary_loss_mlp": 0.01022167, + "balance_loss_clip": 1.0504241, + "balance_loss_mlp": 1.01438594, + "epoch": 0.6378885348403776, + "flos": 23039209284480.0, + "grad_norm": 1.8430296024005839, + "language_loss": 0.82953912, + "learning_rate": 1.2247486136790756e-06, + "loss": 0.85134029, + "num_input_tokens_seen": 114192195, + "step": 5305, + "time_per_iteration": 2.461240291595459 + }, + { + "auxiliary_loss_clip": 0.01163064, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.05182195, + "balance_loss_mlp": 1.02541375, + "epoch": 0.6380087777310166, + "flos": 18697070712960.0, + "grad_norm": 2.080045351034289, + "language_loss": 0.80872786, + "learning_rate": 1.2240306033023726e-06, + "loss": 0.83068776, + "num_input_tokens_seen": 114210020, + "step": 5306, + "time_per_iteration": 2.432070255279541 + }, + { + "auxiliary_loss_clip": 0.01131885, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.04243898, + "balance_loss_mlp": 1.02046561, + "epoch": 0.6381290206216558, + "flos": 23331558078720.0, + "grad_norm": 1.8907804918699203, + "language_loss": 0.72097391, + "learning_rate": 1.223312710639611e-06, + "loss": 0.74257857, + "num_input_tokens_seen": 114228740, + "step": 5307, + "time_per_iteration": 2.52185320854187 + }, + { + "auxiliary_loss_clip": 0.01144887, + "auxiliary_loss_mlp": 0.01026909, + "balance_loss_clip": 1.04798126, + "balance_loss_mlp": 1.01893127, + "epoch": 0.6382492635122948, + "flos": 18880466578560.0, + "grad_norm": 2.1403445603410773, + "language_loss": 0.86983752, + "learning_rate": 1.2225949357996928e-06, + "loss": 0.89155549, + "num_input_tokens_seen": 114246865, + "step": 5308, + "time_per_iteration": 2.453169107437134 + }, + { + "auxiliary_loss_clip": 0.01156393, + "auxiliary_loss_mlp": 0.01026432, + "balance_loss_clip": 1.049945, + "balance_loss_mlp": 1.01925528, + "epoch": 0.6383695064029339, + "flos": 27819134818560.0, + "grad_norm": 1.4945758287424313, + "language_loss": 0.80474615, + "learning_rate": 1.221877278891505e-06, + "loss": 0.82657439, + "num_input_tokens_seen": 114266120, + "step": 5309, + "time_per_iteration": 2.493582248687744 + }, + { + "auxiliary_loss_clip": 0.01166133, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.05068755, + "balance_loss_mlp": 1.0235033, + "epoch": 0.638489749293573, + "flos": 26395635853440.0, + "grad_norm": 1.9957006960726937, + "language_loss": 0.71371841, + "learning_rate": 1.221159740023915e-06, + "loss": 0.7356984, + "num_input_tokens_seen": 114285950, + "step": 5310, + "time_per_iteration": 2.477654218673706 + }, + { + "auxiliary_loss_clip": 0.01140549, + "auxiliary_loss_mlp": 0.00762534, + "balance_loss_clip": 1.04776287, + "balance_loss_mlp": 1.00055432, + "epoch": 0.6386099921842121, + "flos": 23988328306560.0, + "grad_norm": 1.8415529101352488, + "language_loss": 0.72419107, + "learning_rate": 1.2204423193057735e-06, + "loss": 0.74322188, + "num_input_tokens_seen": 114304780, + "step": 5311, + "time_per_iteration": 2.5925655364990234 + }, + { + "auxiliary_loss_clip": 0.01052044, + "auxiliary_loss_mlp": 0.0100202, + "balance_loss_clip": 1.01491857, + "balance_loss_mlp": 1.00102472, + "epoch": 0.6387302350748512, + "flos": 71731169337600.0, + "grad_norm": 0.8919463888173281, + "language_loss": 0.63398248, + "learning_rate": 1.2197250168459122e-06, + "loss": 0.65452307, + "num_input_tokens_seen": 114361180, + "step": 5312, + "time_per_iteration": 3.088937997817993 + }, + { + "auxiliary_loss_clip": 0.01161212, + "auxiliary_loss_mlp": 0.01025727, + "balance_loss_clip": 1.04964519, + "balance_loss_mlp": 1.01815724, + "epoch": 0.6388504779654903, + "flos": 14535778141440.0, + "grad_norm": 1.9262403540987723, + "language_loss": 0.74465376, + "learning_rate": 1.2190078327531454e-06, + "loss": 0.76652312, + "num_input_tokens_seen": 114377425, + "step": 5313, + "time_per_iteration": 2.45031476020813 + }, + { + "auxiliary_loss_clip": 0.01159214, + "auxiliary_loss_mlp": 0.01029459, + "balance_loss_clip": 1.04774022, + "balance_loss_mlp": 1.02214575, + "epoch": 0.6389707208561294, + "flos": 22346133384960.0, + "grad_norm": 1.4173791068018777, + "language_loss": 0.7263236, + "learning_rate": 1.2182907671362697e-06, + "loss": 0.74821031, + "num_input_tokens_seen": 114398120, + "step": 5314, + "time_per_iteration": 2.528268814086914 + }, + { + "auxiliary_loss_clip": 0.0115957, + "auxiliary_loss_mlp": 0.01025745, + "balance_loss_clip": 1.05124092, + "balance_loss_mlp": 1.01796103, + "epoch": 0.6390909637467684, + "flos": 19426883247360.0, + "grad_norm": 1.7790856853570518, + "language_loss": 0.78865641, + "learning_rate": 1.2175738201040626e-06, + "loss": 0.81050956, + "num_input_tokens_seen": 114415160, + "step": 5315, + "time_per_iteration": 2.4518160820007324 + }, + { + "auxiliary_loss_clip": 0.01157315, + "auxiliary_loss_mlp": 0.01029092, + "balance_loss_clip": 1.04750371, + "balance_loss_mlp": 1.02150989, + "epoch": 0.6392112066374076, + "flos": 24090852700800.0, + "grad_norm": 1.7995876809726286, + "language_loss": 0.78570986, + "learning_rate": 1.2168569917652855e-06, + "loss": 0.80757391, + "num_input_tokens_seen": 114435015, + "step": 5316, + "time_per_iteration": 2.4894604682922363 + }, + { + "auxiliary_loss_clip": 0.01161029, + "auxiliary_loss_mlp": 0.0102335, + "balance_loss_clip": 1.05093455, + "balance_loss_mlp": 1.01529157, + "epoch": 0.6393314495280467, + "flos": 26795141896320.0, + "grad_norm": 1.5124650012420744, + "language_loss": 0.63884765, + "learning_rate": 1.2161402822286797e-06, + "loss": 0.66069144, + "num_input_tokens_seen": 114455700, + "step": 5317, + "time_per_iteration": 2.4946131706237793 + }, + { + "auxiliary_loss_clip": 0.01127592, + "auxiliary_loss_mlp": 0.01021914, + "balance_loss_clip": 1.04453397, + "balance_loss_mlp": 1.01433849, + "epoch": 0.6394516924186857, + "flos": 20260692633600.0, + "grad_norm": 2.1216247270816617, + "language_loss": 0.78773034, + "learning_rate": 1.2154236916029703e-06, + "loss": 0.80922544, + "num_input_tokens_seen": 114473675, + "step": 5318, + "time_per_iteration": 2.497843027114868 + }, + { + "auxiliary_loss_clip": 0.01115721, + "auxiliary_loss_mlp": 0.01024901, + "balance_loss_clip": 1.04071355, + "balance_loss_mlp": 1.01742077, + "epoch": 0.6395719353093249, + "flos": 18368847210240.0, + "grad_norm": 2.534705224720435, + "language_loss": 0.73448962, + "learning_rate": 1.2147072199968627e-06, + "loss": 0.75589585, + "num_input_tokens_seen": 114492310, + "step": 5319, + "time_per_iteration": 2.5192885398864746 + }, + { + "auxiliary_loss_clip": 0.01156808, + "auxiliary_loss_mlp": 0.01026179, + "balance_loss_clip": 1.04859042, + "balance_loss_mlp": 1.01899624, + "epoch": 0.6396921781999639, + "flos": 17566315591680.0, + "grad_norm": 1.687268839794007, + "language_loss": 0.71678048, + "learning_rate": 1.2139908675190454e-06, + "loss": 0.73861033, + "num_input_tokens_seen": 114511520, + "step": 5320, + "time_per_iteration": 2.4200022220611572 + }, + { + "auxiliary_loss_clip": 0.01091807, + "auxiliary_loss_mlp": 0.01025279, + "balance_loss_clip": 1.03890812, + "balance_loss_mlp": 1.01778078, + "epoch": 0.639812421090603, + "flos": 21251252972160.0, + "grad_norm": 2.1242980023188798, + "language_loss": 0.75461257, + "learning_rate": 1.2132746342781883e-06, + "loss": 0.77578342, + "num_input_tokens_seen": 114532680, + "step": 5321, + "time_per_iteration": 2.600229024887085 + }, + { + "auxiliary_loss_clip": 0.0117499, + "auxiliary_loss_mlp": 0.01026994, + "balance_loss_clip": 1.05134273, + "balance_loss_mlp": 1.0186435, + "epoch": 0.6399326639812422, + "flos": 11180967684480.0, + "grad_norm": 2.4183223498876187, + "language_loss": 0.79890311, + "learning_rate": 1.2125585203829442e-06, + "loss": 0.82092297, + "num_input_tokens_seen": 114548320, + "step": 5322, + "time_per_iteration": 3.1619529724121094 + }, + { + "auxiliary_loss_clip": 0.01118718, + "auxiliary_loss_mlp": 0.01030233, + "balance_loss_clip": 1.04556656, + "balance_loss_mlp": 1.02222228, + "epoch": 0.6400529068718812, + "flos": 23911048195200.0, + "grad_norm": 4.753229500658536, + "language_loss": 0.74411368, + "learning_rate": 1.211842525941946e-06, + "loss": 0.76560318, + "num_input_tokens_seen": 114568115, + "step": 5323, + "time_per_iteration": 2.5193850994110107 + }, + { + "auxiliary_loss_clip": 0.0111287, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.04585505, + "balance_loss_mlp": 1.01857877, + "epoch": 0.6401731497625203, + "flos": 44018724890880.0, + "grad_norm": 1.8372634232212812, + "language_loss": 0.78586113, + "learning_rate": 1.2111266510638105e-06, + "loss": 0.80724972, + "num_input_tokens_seen": 114591040, + "step": 5324, + "time_per_iteration": 2.7600488662719727 + }, + { + "auxiliary_loss_clip": 0.01094668, + "auxiliary_loss_mlp": 0.01026398, + "balance_loss_clip": 1.04182291, + "balance_loss_mlp": 1.01886964, + "epoch": 0.6402933926531594, + "flos": 20662209838080.0, + "grad_norm": 1.7788410479294903, + "language_loss": 0.80138856, + "learning_rate": 1.2104108958571346e-06, + "loss": 0.82259917, + "num_input_tokens_seen": 114609310, + "step": 5325, + "time_per_iteration": 2.6203114986419678 + }, + { + "auxiliary_loss_clip": 0.01155702, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.04929554, + "balance_loss_mlp": 1.02068889, + "epoch": 0.6404136355437985, + "flos": 24863327614080.0, + "grad_norm": 1.4649566621947558, + "language_loss": 0.75913548, + "learning_rate": 1.2096952604304975e-06, + "loss": 0.7809732, + "num_input_tokens_seen": 114629740, + "step": 5326, + "time_per_iteration": 3.3330068588256836 + }, + { + "auxiliary_loss_clip": 0.01160867, + "auxiliary_loss_mlp": 0.01026281, + "balance_loss_clip": 1.04821277, + "balance_loss_mlp": 1.01840734, + "epoch": 0.6405338784344375, + "flos": 40479548901120.0, + "grad_norm": 2.3478994769926786, + "language_loss": 0.705194, + "learning_rate": 1.2089797448924616e-06, + "loss": 0.72706544, + "num_input_tokens_seen": 114653615, + "step": 5327, + "time_per_iteration": 3.415560007095337 + }, + { + "auxiliary_loss_clip": 0.01122666, + "auxiliary_loss_mlp": 0.01026401, + "balance_loss_clip": 1.0423646, + "balance_loss_mlp": 1.01864362, + "epoch": 0.6406541213250767, + "flos": 20886041439360.0, + "grad_norm": 2.0119672809734164, + "language_loss": 0.65710902, + "learning_rate": 1.2082643493515692e-06, + "loss": 0.67859972, + "num_input_tokens_seen": 114671935, + "step": 5328, + "time_per_iteration": 2.5431039333343506 + }, + { + "auxiliary_loss_clip": 0.01157258, + "auxiliary_loss_mlp": 0.01024726, + "balance_loss_clip": 1.04804981, + "balance_loss_mlp": 1.01707888, + "epoch": 0.6407743642157158, + "flos": 23295970679040.0, + "grad_norm": 2.0107220421596996, + "language_loss": 0.82066506, + "learning_rate": 1.207549073916346e-06, + "loss": 0.84248489, + "num_input_tokens_seen": 114692870, + "step": 5329, + "time_per_iteration": 3.2290544509887695 + }, + { + "auxiliary_loss_clip": 0.01134585, + "auxiliary_loss_mlp": 0.01022905, + "balance_loss_clip": 1.04556584, + "balance_loss_mlp": 1.01579404, + "epoch": 0.6408946071063548, + "flos": 15012636122880.0, + "grad_norm": 1.8958550359033812, + "language_loss": 0.77789593, + "learning_rate": 1.2068339186952976e-06, + "loss": 0.7994709, + "num_input_tokens_seen": 114710410, + "step": 5330, + "time_per_iteration": 2.5155410766601562 + }, + { + "auxiliary_loss_clip": 0.011611, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.04946446, + "balance_loss_mlp": 1.02127123, + "epoch": 0.6410148499969939, + "flos": 22528595496960.0, + "grad_norm": 2.0442715750028824, + "language_loss": 0.73243988, + "learning_rate": 1.2061188837969136e-06, + "loss": 0.75434262, + "num_input_tokens_seen": 114730020, + "step": 5331, + "time_per_iteration": 2.4662926197052 + }, + { + "auxiliary_loss_clip": 0.01122873, + "auxiliary_loss_mlp": 0.01025999, + "balance_loss_clip": 1.04150641, + "balance_loss_mlp": 1.01804209, + "epoch": 0.641135092887633, + "flos": 12422004537600.0, + "grad_norm": 2.480355896826845, + "language_loss": 0.84041226, + "learning_rate": 1.2054039693296631e-06, + "loss": 0.86190093, + "num_input_tokens_seen": 114748015, + "step": 5332, + "time_per_iteration": 2.5016486644744873 + }, + { + "auxiliary_loss_clip": 0.01125519, + "auxiliary_loss_mlp": 0.01029453, + "balance_loss_clip": 1.0445025, + "balance_loss_mlp": 1.02235389, + "epoch": 0.6412553357782721, + "flos": 22127329687680.0, + "grad_norm": 1.8420310628330967, + "language_loss": 0.81520379, + "learning_rate": 1.2046891754019992e-06, + "loss": 0.83675349, + "num_input_tokens_seen": 114768625, + "step": 5333, + "time_per_iteration": 2.536783218383789 + }, + { + "auxiliary_loss_clip": 0.01161449, + "auxiliary_loss_mlp": 0.01025574, + "balance_loss_clip": 1.04948533, + "balance_loss_mlp": 1.0180223, + "epoch": 0.6413755786689112, + "flos": 15888605097600.0, + "grad_norm": 1.9282467162316852, + "language_loss": 0.82951975, + "learning_rate": 1.2039745021223548e-06, + "loss": 0.85139, + "num_input_tokens_seen": 114786045, + "step": 5334, + "time_per_iteration": 2.4252753257751465 + }, + { + "auxiliary_loss_clip": 0.01023782, + "auxiliary_loss_mlp": 0.01004669, + "balance_loss_clip": 1.0148356, + "balance_loss_mlp": 1.0038408, + "epoch": 0.6414958215595503, + "flos": 68039159955840.0, + "grad_norm": 0.7914621700395528, + "language_loss": 0.57056165, + "learning_rate": 1.2032599495991456e-06, + "loss": 0.59084612, + "num_input_tokens_seen": 114850785, + "step": 5335, + "time_per_iteration": 3.2243902683258057 + }, + { + "auxiliary_loss_clip": 0.01157959, + "auxiliary_loss_mlp": 0.01023383, + "balance_loss_clip": 1.04883182, + "balance_loss_mlp": 1.01552749, + "epoch": 0.6416160644501894, + "flos": 44091300320640.0, + "grad_norm": 2.070151705950607, + "language_loss": 0.70034564, + "learning_rate": 1.2025455179407685e-06, + "loss": 0.72215903, + "num_input_tokens_seen": 114871945, + "step": 5336, + "time_per_iteration": 2.6606969833374023 + }, + { + "auxiliary_loss_clip": 0.01155571, + "auxiliary_loss_mlp": 0.0076235, + "balance_loss_clip": 1.04840839, + "balance_loss_mlp": 1.00055385, + "epoch": 0.6417363073408284, + "flos": 20959837931520.0, + "grad_norm": 2.0420737735067864, + "language_loss": 0.73590958, + "learning_rate": 1.2018312072556022e-06, + "loss": 0.75508875, + "num_input_tokens_seen": 114890445, + "step": 5337, + "time_per_iteration": 2.4405548572540283 + }, + { + "auxiliary_loss_clip": 0.01168738, + "auxiliary_loss_mlp": 0.00762265, + "balance_loss_clip": 1.04844379, + "balance_loss_mlp": 1.00057149, + "epoch": 0.6418565502314676, + "flos": 22455122227200.0, + "grad_norm": 1.669480341017247, + "language_loss": 0.74216533, + "learning_rate": 1.2011170176520077e-06, + "loss": 0.76147532, + "num_input_tokens_seen": 114911360, + "step": 5338, + "time_per_iteration": 2.431079387664795 + }, + { + "auxiliary_loss_clip": 0.01084319, + "auxiliary_loss_mlp": 0.01023944, + "balance_loss_clip": 1.03932023, + "balance_loss_mlp": 1.01627302, + "epoch": 0.6419767931221066, + "flos": 25045502417280.0, + "grad_norm": 1.6519615999822461, + "language_loss": 0.81233078, + "learning_rate": 1.2004029492383256e-06, + "loss": 0.83341342, + "num_input_tokens_seen": 114932700, + "step": 5339, + "time_per_iteration": 2.6286911964416504 + }, + { + "auxiliary_loss_clip": 0.01156269, + "auxiliary_loss_mlp": 0.01026485, + "balance_loss_clip": 1.04970431, + "balance_loss_mlp": 1.01866508, + "epoch": 0.6420970360127457, + "flos": 19463691709440.0, + "grad_norm": 1.9546883748002868, + "language_loss": 0.73321879, + "learning_rate": 1.1996890021228814e-06, + "loss": 0.75504637, + "num_input_tokens_seen": 114949475, + "step": 5340, + "time_per_iteration": 2.434872627258301 + }, + { + "auxiliary_loss_clip": 0.01140382, + "auxiliary_loss_mlp": 0.01024294, + "balance_loss_clip": 1.04528308, + "balance_loss_mlp": 1.01681995, + "epoch": 0.6422172789033849, + "flos": 40406147458560.0, + "grad_norm": 1.5912915364760845, + "language_loss": 0.69944096, + "learning_rate": 1.1989751764139785e-06, + "loss": 0.72108781, + "num_input_tokens_seen": 114973125, + "step": 5341, + "time_per_iteration": 2.653404474258423 + }, + { + "auxiliary_loss_clip": 0.01110681, + "auxiliary_loss_mlp": 0.01023054, + "balance_loss_clip": 1.03812551, + "balance_loss_mlp": 1.01520967, + "epoch": 0.6423375217940239, + "flos": 27672870637440.0, + "grad_norm": 1.62401096740937, + "language_loss": 0.82967788, + "learning_rate": 1.1982614722199044e-06, + "loss": 0.85101521, + "num_input_tokens_seen": 114994300, + "step": 5342, + "time_per_iteration": 2.591768503189087 + }, + { + "auxiliary_loss_clip": 0.01147619, + "auxiliary_loss_mlp": 0.01025007, + "balance_loss_clip": 1.04608083, + "balance_loss_mlp": 1.01809049, + "epoch": 0.642457764684663, + "flos": 18369242259840.0, + "grad_norm": 2.039110277937119, + "language_loss": 0.77809697, + "learning_rate": 1.1975478896489276e-06, + "loss": 0.79982316, + "num_input_tokens_seen": 115012135, + "step": 5343, + "time_per_iteration": 2.470501184463501 + }, + { + "auxiliary_loss_clip": 0.01168439, + "auxiliary_loss_mlp": 0.01023038, + "balance_loss_clip": 1.04824841, + "balance_loss_mlp": 1.01601088, + "epoch": 0.6425780075753021, + "flos": 19750509809280.0, + "grad_norm": 1.9409994790536031, + "language_loss": 0.76477146, + "learning_rate": 1.1968344288092981e-06, + "loss": 0.78668618, + "num_input_tokens_seen": 115028715, + "step": 5344, + "time_per_iteration": 2.40632700920105 + }, + { + "auxiliary_loss_clip": 0.01157403, + "auxiliary_loss_mlp": 0.00762356, + "balance_loss_clip": 1.0491153, + "balance_loss_mlp": 1.00062203, + "epoch": 0.6426982504659412, + "flos": 20558536208640.0, + "grad_norm": 2.5298245518084315, + "language_loss": 0.64473689, + "learning_rate": 1.1961210898092468e-06, + "loss": 0.66393447, + "num_input_tokens_seen": 115047665, + "step": 5345, + "time_per_iteration": 2.4848885536193848 + }, + { + "auxiliary_loss_clip": 0.01149248, + "auxiliary_loss_mlp": 0.01027569, + "balance_loss_clip": 1.04882598, + "balance_loss_mlp": 1.0194633, + "epoch": 0.6428184933565803, + "flos": 17851984456320.0, + "grad_norm": 4.701172722725984, + "language_loss": 0.796799, + "learning_rate": 1.1954078727569874e-06, + "loss": 0.81856716, + "num_input_tokens_seen": 115064965, + "step": 5346, + "time_per_iteration": 2.4554507732391357 + }, + { + "auxiliary_loss_clip": 0.01132288, + "auxiliary_loss_mlp": 0.0076226, + "balance_loss_clip": 1.04337668, + "balance_loss_mlp": 1.00056815, + "epoch": 0.6429387362472194, + "flos": 22456953820800.0, + "grad_norm": 1.6386806484125886, + "language_loss": 0.77965021, + "learning_rate": 1.1946947777607141e-06, + "loss": 0.79859573, + "num_input_tokens_seen": 115086100, + "step": 5347, + "time_per_iteration": 2.5322957038879395 + }, + { + "auxiliary_loss_clip": 0.01109016, + "auxiliary_loss_mlp": 0.01024265, + "balance_loss_clip": 1.04210496, + "balance_loss_mlp": 1.01631975, + "epoch": 0.6430589791378585, + "flos": 24752579005440.0, + "grad_norm": 1.8584480875619018, + "language_loss": 0.79986107, + "learning_rate": 1.1939818049286024e-06, + "loss": 0.82119393, + "num_input_tokens_seen": 115104260, + "step": 5348, + "time_per_iteration": 2.569805383682251 + }, + { + "auxiliary_loss_clip": 0.0108661, + "auxiliary_loss_mlp": 0.0102786, + "balance_loss_clip": 1.03920484, + "balance_loss_mlp": 1.0205524, + "epoch": 0.6431792220284975, + "flos": 24901249397760.0, + "grad_norm": 1.519905985688268, + "language_loss": 0.75588173, + "learning_rate": 1.1932689543688101e-06, + "loss": 0.77702647, + "num_input_tokens_seen": 115125365, + "step": 5349, + "time_per_iteration": 3.385685920715332 + }, + { + "auxiliary_loss_clip": 0.01143903, + "auxiliary_loss_mlp": 0.01026624, + "balance_loss_clip": 1.04815221, + "balance_loss_mlp": 1.01902127, + "epoch": 0.6432994649191367, + "flos": 21032305620480.0, + "grad_norm": 2.2725091381337816, + "language_loss": 0.7260707, + "learning_rate": 1.1925562261894756e-06, + "loss": 0.74777591, + "num_input_tokens_seen": 115144445, + "step": 5350, + "time_per_iteration": 2.4838509559631348 + }, + { + "auxiliary_loss_clip": 0.01139656, + "auxiliary_loss_mlp": 0.01027563, + "balance_loss_clip": 1.04515898, + "balance_loss_mlp": 1.02014184, + "epoch": 0.6434197078097758, + "flos": 30884433655680.0, + "grad_norm": 3.0386379653774607, + "language_loss": 0.77880096, + "learning_rate": 1.1918436204987207e-06, + "loss": 0.80047309, + "num_input_tokens_seen": 115166305, + "step": 5351, + "time_per_iteration": 2.5686447620391846 + }, + { + "auxiliary_loss_clip": 0.01155893, + "auxiliary_loss_mlp": 0.01024305, + "balance_loss_clip": 1.05091619, + "balance_loss_mlp": 1.01693153, + "epoch": 0.6435399507004148, + "flos": 15012492468480.0, + "grad_norm": 1.9502419607140733, + "language_loss": 0.81479049, + "learning_rate": 1.191131137404645e-06, + "loss": 0.83659244, + "num_input_tokens_seen": 115183045, + "step": 5352, + "time_per_iteration": 3.273716926574707 + }, + { + "auxiliary_loss_clip": 0.01116893, + "auxiliary_loss_mlp": 0.010283, + "balance_loss_clip": 1.04344296, + "balance_loss_mlp": 1.02114797, + "epoch": 0.643660193591054, + "flos": 19901981462400.0, + "grad_norm": 2.0284984924712854, + "language_loss": 0.76935744, + "learning_rate": 1.190418777015333e-06, + "loss": 0.79080939, + "num_input_tokens_seen": 115201955, + "step": 5353, + "time_per_iteration": 2.5105373859405518 + }, + { + "auxiliary_loss_clip": 0.01143691, + "auxiliary_loss_mlp": 0.0102059, + "balance_loss_clip": 1.04684758, + "balance_loss_mlp": 1.01346767, + "epoch": 0.643780436481693, + "flos": 24133622820480.0, + "grad_norm": 1.4747283207564812, + "language_loss": 0.7382971, + "learning_rate": 1.1897065394388487e-06, + "loss": 0.75993991, + "num_input_tokens_seen": 115222395, + "step": 5354, + "time_per_iteration": 3.409649610519409 + }, + { + "auxiliary_loss_clip": 0.01143752, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.05037212, + "balance_loss_mlp": 1.02169657, + "epoch": 0.6439006793723321, + "flos": 23148808657920.0, + "grad_norm": 1.6125624525076288, + "language_loss": 0.76621556, + "learning_rate": 1.1889944247832385e-06, + "loss": 0.78794616, + "num_input_tokens_seen": 115242635, + "step": 5355, + "time_per_iteration": 3.2758243083953857 + }, + { + "auxiliary_loss_clip": 0.01157667, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.04512191, + "balance_loss_mlp": 1.02062535, + "epoch": 0.6440209222629713, + "flos": 23617909301760.0, + "grad_norm": 1.9098594187834383, + "language_loss": 0.70562905, + "learning_rate": 1.1882824331565283e-06, + "loss": 0.72748739, + "num_input_tokens_seen": 115262095, + "step": 5356, + "time_per_iteration": 2.521042823791504 + }, + { + "auxiliary_loss_clip": 0.01123631, + "auxiliary_loss_mlp": 0.01027701, + "balance_loss_clip": 1.04210019, + "balance_loss_mlp": 1.02037013, + "epoch": 0.6441411651536103, + "flos": 16544872535040.0, + "grad_norm": 2.014874191587914, + "language_loss": 0.89210314, + "learning_rate": 1.1875705646667287e-06, + "loss": 0.91361648, + "num_input_tokens_seen": 115279985, + "step": 5357, + "time_per_iteration": 2.487450122833252 + }, + { + "auxiliary_loss_clip": 0.01153017, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.04431057, + "balance_loss_mlp": 1.01861548, + "epoch": 0.6442614080442494, + "flos": 25410965345280.0, + "grad_norm": 1.9162354228599496, + "language_loss": 0.75268722, + "learning_rate": 1.1868588194218282e-06, + "loss": 0.77448559, + "num_input_tokens_seen": 115300365, + "step": 5358, + "time_per_iteration": 2.4772651195526123 + }, + { + "auxiliary_loss_clip": 0.01148005, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.04510999, + "balance_loss_mlp": 1.02232909, + "epoch": 0.6443816509348885, + "flos": 28294017552000.0, + "grad_norm": 1.6328647738082025, + "language_loss": 0.73876476, + "learning_rate": 1.1861471975297979e-06, + "loss": 0.76054585, + "num_input_tokens_seen": 115322060, + "step": 5359, + "time_per_iteration": 2.5494964122772217 + }, + { + "auxiliary_loss_clip": 0.01126722, + "auxiliary_loss_mlp": 0.01022555, + "balance_loss_clip": 1.04684877, + "balance_loss_mlp": 1.01459765, + "epoch": 0.6445018938255276, + "flos": 36690075964800.0, + "grad_norm": 1.6199011382781163, + "language_loss": 0.70958543, + "learning_rate": 1.185435699098591e-06, + "loss": 0.73107815, + "num_input_tokens_seen": 115348255, + "step": 5360, + "time_per_iteration": 2.69227933883667 + }, + { + "auxiliary_loss_clip": 0.01145888, + "auxiliary_loss_mlp": 0.01023233, + "balance_loss_clip": 1.04590225, + "balance_loss_mlp": 1.0156008, + "epoch": 0.6446221367161666, + "flos": 14501411804160.0, + "grad_norm": 7.536487055703115, + "language_loss": 0.78403491, + "learning_rate": 1.1847243242361403e-06, + "loss": 0.80572605, + "num_input_tokens_seen": 115366845, + "step": 5361, + "time_per_iteration": 2.450035810470581 + }, + { + "auxiliary_loss_clip": 0.0114605, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.04786444, + "balance_loss_mlp": 1.02666068, + "epoch": 0.6447423796068057, + "flos": 24609367480320.0, + "grad_norm": 1.9725811768542592, + "language_loss": 0.7779808, + "learning_rate": 1.1840130730503624e-06, + "loss": 0.79978287, + "num_input_tokens_seen": 115388125, + "step": 5362, + "time_per_iteration": 2.5733940601348877 + }, + { + "auxiliary_loss_clip": 0.01171787, + "auxiliary_loss_mlp": 0.01026473, + "balance_loss_clip": 1.0492866, + "balance_loss_mlp": 1.01912951, + "epoch": 0.6448626224974449, + "flos": 25047298097280.0, + "grad_norm": 1.7002227044297158, + "language_loss": 0.74608403, + "learning_rate": 1.1833019456491518e-06, + "loss": 0.76806664, + "num_input_tokens_seen": 115409655, + "step": 5363, + "time_per_iteration": 2.517425298690796 + }, + { + "auxiliary_loss_clip": 0.01159534, + "auxiliary_loss_mlp": 0.01025886, + "balance_loss_clip": 1.04968572, + "balance_loss_mlp": 1.01842976, + "epoch": 0.6449828653880839, + "flos": 22530355263360.0, + "grad_norm": 2.1289093804855015, + "language_loss": 0.79254144, + "learning_rate": 1.1825909421403871e-06, + "loss": 0.81439567, + "num_input_tokens_seen": 115428750, + "step": 5364, + "time_per_iteration": 2.473006010055542 + }, + { + "auxiliary_loss_clip": 0.01156489, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.04714763, + "balance_loss_mlp": 1.02008557, + "epoch": 0.645103108278723, + "flos": 25695736369920.0, + "grad_norm": 1.6775703708739282, + "language_loss": 0.76189846, + "learning_rate": 1.181880062631926e-06, + "loss": 0.78373384, + "num_input_tokens_seen": 115448085, + "step": 5365, + "time_per_iteration": 2.4765913486480713 + }, + { + "auxiliary_loss_clip": 0.01138044, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.04679561, + "balance_loss_mlp": 1.02295315, + "epoch": 0.6452233511693621, + "flos": 27450331925760.0, + "grad_norm": 2.003137936354261, + "language_loss": 0.84850895, + "learning_rate": 1.1811693072316093e-06, + "loss": 0.87020272, + "num_input_tokens_seen": 115465765, + "step": 5366, + "time_per_iteration": 2.517852783203125 + }, + { + "auxiliary_loss_clip": 0.01169989, + "auxiliary_loss_mlp": 0.00762463, + "balance_loss_clip": 1.0467217, + "balance_loss_mlp": 1.00054502, + "epoch": 0.6453435940600012, + "flos": 19208618254080.0, + "grad_norm": 2.174860207167397, + "language_loss": 0.84323335, + "learning_rate": 1.1804586760472574e-06, + "loss": 0.86255783, + "num_input_tokens_seen": 115482230, + "step": 5367, + "time_per_iteration": 2.4059057235717773 + }, + { + "auxiliary_loss_clip": 0.01126319, + "auxiliary_loss_mlp": 0.01023378, + "balance_loss_clip": 1.0435679, + "balance_loss_mlp": 1.01556945, + "epoch": 0.6454638369506402, + "flos": 25737680476800.0, + "grad_norm": 2.2075010916483717, + "language_loss": 0.80187517, + "learning_rate": 1.1797481691866736e-06, + "loss": 0.82337213, + "num_input_tokens_seen": 115499455, + "step": 5368, + "time_per_iteration": 2.530470609664917 + }, + { + "auxiliary_loss_clip": 0.01136434, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.04761469, + "balance_loss_mlp": 1.02525771, + "epoch": 0.6455840798412794, + "flos": 20989176364800.0, + "grad_norm": 2.000739103417135, + "language_loss": 0.83167076, + "learning_rate": 1.1790377867576393e-06, + "loss": 0.85336113, + "num_input_tokens_seen": 115517205, + "step": 5369, + "time_per_iteration": 2.4924209117889404 + }, + { + "auxiliary_loss_clip": 0.01146224, + "auxiliary_loss_mlp": 0.01026275, + "balance_loss_clip": 1.04642403, + "balance_loss_mlp": 1.01892543, + "epoch": 0.6457043227319185, + "flos": 26067556005120.0, + "grad_norm": 2.1072539658293605, + "language_loss": 0.76444286, + "learning_rate": 1.1783275288679203e-06, + "loss": 0.78616786, + "num_input_tokens_seen": 115534370, + "step": 5370, + "time_per_iteration": 2.5288474559783936 + }, + { + "auxiliary_loss_clip": 0.01064654, + "auxiliary_loss_mlp": 0.01002359, + "balance_loss_clip": 1.01657021, + "balance_loss_mlp": 1.0013876, + "epoch": 0.6458245656225575, + "flos": 60370831088640.0, + "grad_norm": 0.8475207745816336, + "language_loss": 0.57112044, + "learning_rate": 1.177617395625262e-06, + "loss": 0.59179056, + "num_input_tokens_seen": 115592345, + "step": 5371, + "time_per_iteration": 3.0110514163970947 + }, + { + "auxiliary_loss_clip": 0.01156936, + "auxiliary_loss_mlp": 0.01028401, + "balance_loss_clip": 1.04955232, + "balance_loss_mlp": 1.02099466, + "epoch": 0.6459448085131967, + "flos": 23076771932160.0, + "grad_norm": 2.109194996184222, + "language_loss": 0.75144202, + "learning_rate": 1.1769073871373908e-06, + "loss": 0.77329534, + "num_input_tokens_seen": 115612550, + "step": 5372, + "time_per_iteration": 2.4809679985046387 + }, + { + "auxiliary_loss_clip": 0.01124834, + "auxiliary_loss_mlp": 0.01022894, + "balance_loss_clip": 1.04220545, + "balance_loss_mlp": 1.01571178, + "epoch": 0.6460650514038357, + "flos": 22598190097920.0, + "grad_norm": 1.6962795854723085, + "language_loss": 0.83681768, + "learning_rate": 1.176197503512015e-06, + "loss": 0.85829496, + "num_input_tokens_seen": 115632265, + "step": 5373, + "time_per_iteration": 2.5287370681762695 + }, + { + "auxiliary_loss_clip": 0.01140674, + "auxiliary_loss_mlp": 0.01026543, + "balance_loss_clip": 1.0466876, + "balance_loss_mlp": 1.01932752, + "epoch": 0.6461852942944748, + "flos": 20266726118400.0, + "grad_norm": 2.207626833863588, + "language_loss": 0.82171977, + "learning_rate": 1.1754877448568223e-06, + "loss": 0.8433919, + "num_input_tokens_seen": 115651720, + "step": 5374, + "time_per_iteration": 2.4975805282592773 + }, + { + "auxiliary_loss_clip": 0.01141588, + "auxiliary_loss_mlp": 0.01025345, + "balance_loss_clip": 1.04442525, + "balance_loss_mlp": 1.0178293, + "epoch": 0.646305537185114, + "flos": 23367109564800.0, + "grad_norm": 2.7149867036172752, + "language_loss": 0.89946866, + "learning_rate": 1.1747781112794837e-06, + "loss": 0.92113799, + "num_input_tokens_seen": 115668215, + "step": 5375, + "time_per_iteration": 3.359842538833618 + }, + { + "auxiliary_loss_clip": 0.01127006, + "auxiliary_loss_mlp": 0.0102665, + "balance_loss_clip": 1.04610848, + "balance_loss_mlp": 1.01922929, + "epoch": 0.646425780075753, + "flos": 24277480790400.0, + "grad_norm": 1.814659732872634, + "language_loss": 0.82686841, + "learning_rate": 1.1740686028876487e-06, + "loss": 0.848405, + "num_input_tokens_seen": 115687080, + "step": 5376, + "time_per_iteration": 2.5411336421966553 + }, + { + "auxiliary_loss_clip": 0.01152356, + "auxiliary_loss_mlp": 0.01023774, + "balance_loss_clip": 1.0474751, + "balance_loss_mlp": 1.01667261, + "epoch": 0.6465460229663921, + "flos": 20813968800000.0, + "grad_norm": 3.232957618848814, + "language_loss": 0.74605536, + "learning_rate": 1.1733592197889507e-06, + "loss": 0.76781666, + "num_input_tokens_seen": 115703990, + "step": 5377, + "time_per_iteration": 2.4445273876190186 + }, + { + "auxiliary_loss_clip": 0.01149306, + "auxiliary_loss_mlp": 0.0102324, + "balance_loss_clip": 1.04729044, + "balance_loss_mlp": 1.01636434, + "epoch": 0.6466662658570312, + "flos": 22853299466880.0, + "grad_norm": 1.902984090622666, + "language_loss": 0.72670501, + "learning_rate": 1.1726499620910014e-06, + "loss": 0.74843043, + "num_input_tokens_seen": 115724270, + "step": 5378, + "time_per_iteration": 2.4676947593688965 + }, + { + "auxiliary_loss_clip": 0.01155216, + "auxiliary_loss_mlp": 0.0102417, + "balance_loss_clip": 1.04725885, + "balance_loss_mlp": 1.01644814, + "epoch": 0.6467865087476703, + "flos": 15304553953920.0, + "grad_norm": 1.9012010274018538, + "language_loss": 0.77490056, + "learning_rate": 1.1719408299013955e-06, + "loss": 0.7966944, + "num_input_tokens_seen": 115742995, + "step": 5379, + "time_per_iteration": 3.349086046218872 + }, + { + "auxiliary_loss_clip": 0.01170034, + "auxiliary_loss_mlp": 0.01028877, + "balance_loss_clip": 1.05068803, + "balance_loss_mlp": 1.0214268, + "epoch": 0.6469067516383094, + "flos": 19573650218880.0, + "grad_norm": 2.210741846198485, + "language_loss": 0.75815099, + "learning_rate": 1.1712318233277067e-06, + "loss": 0.7801401, + "num_input_tokens_seen": 115762015, + "step": 5380, + "time_per_iteration": 3.2264480590820312 + }, + { + "auxiliary_loss_clip": 0.01063531, + "auxiliary_loss_mlp": 0.01001287, + "balance_loss_clip": 1.01692963, + "balance_loss_mlp": 1.00033379, + "epoch": 0.6470269945289485, + "flos": 65098002522240.0, + "grad_norm": 0.7497049174802937, + "language_loss": 0.57815784, + "learning_rate": 1.1705229424774916e-06, + "loss": 0.59880602, + "num_input_tokens_seen": 115816285, + "step": 5381, + "time_per_iteration": 3.631687641143799 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.04396987, + "balance_loss_mlp": 1.02137303, + "epoch": 0.6471472374195876, + "flos": 30696943639680.0, + "grad_norm": 1.6849529909741638, + "language_loss": 0.64126766, + "learning_rate": 1.1698141874582867e-06, + "loss": 0.66294122, + "num_input_tokens_seen": 115837330, + "step": 5382, + "time_per_iteration": 2.56235671043396 + }, + { + "auxiliary_loss_clip": 0.01168809, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.04950714, + "balance_loss_mlp": 1.02075112, + "epoch": 0.6472674803102266, + "flos": 20521835487360.0, + "grad_norm": 2.069375863564844, + "language_loss": 0.72142899, + "learning_rate": 1.169105558377609e-06, + "loss": 0.74339116, + "num_input_tokens_seen": 115857420, + "step": 5383, + "time_per_iteration": 2.416995048522949 + }, + { + "auxiliary_loss_clip": 0.01118094, + "auxiliary_loss_mlp": 0.00762362, + "balance_loss_clip": 1.04979205, + "balance_loss_mlp": 1.00058675, + "epoch": 0.6473877232008658, + "flos": 24715447320960.0, + "grad_norm": 1.6796468554461612, + "language_loss": 0.78325164, + "learning_rate": 1.1683970553429587e-06, + "loss": 0.80205619, + "num_input_tokens_seen": 115878875, + "step": 5384, + "time_per_iteration": 2.5850071907043457 + }, + { + "auxiliary_loss_clip": 0.01130428, + "auxiliary_loss_mlp": 0.01026937, + "balance_loss_clip": 1.04514277, + "balance_loss_mlp": 1.01936769, + "epoch": 0.6475079660915048, + "flos": 15885552441600.0, + "grad_norm": 3.2797018940203744, + "language_loss": 0.8169831, + "learning_rate": 1.1676886784618128e-06, + "loss": 0.83855677, + "num_input_tokens_seen": 115895540, + "step": 5385, + "time_per_iteration": 2.4666833877563477 + }, + { + "auxiliary_loss_clip": 0.01157426, + "auxiliary_loss_mlp": 0.01025667, + "balance_loss_clip": 1.04839325, + "balance_loss_mlp": 1.01796043, + "epoch": 0.6476282089821439, + "flos": 17381590922880.0, + "grad_norm": 2.3627084171020756, + "language_loss": 0.83817947, + "learning_rate": 1.1669804278416332e-06, + "loss": 0.86001039, + "num_input_tokens_seen": 115910265, + "step": 5386, + "time_per_iteration": 2.4138023853302 + }, + { + "auxiliary_loss_clip": 0.01147047, + "auxiliary_loss_mlp": 0.01026758, + "balance_loss_clip": 1.04809904, + "balance_loss_mlp": 1.01897407, + "epoch": 0.6477484518727831, + "flos": 20194078861440.0, + "grad_norm": 1.9173957365869634, + "language_loss": 0.71284294, + "learning_rate": 1.1662723035898602e-06, + "loss": 0.73458099, + "num_input_tokens_seen": 115930025, + "step": 5387, + "time_per_iteration": 2.4842822551727295 + }, + { + "auxiliary_loss_clip": 0.01155594, + "auxiliary_loss_mlp": 0.01023982, + "balance_loss_clip": 1.04855776, + "balance_loss_mlp": 1.01643562, + "epoch": 0.6478686947634221, + "flos": 25410426641280.0, + "grad_norm": 1.5864928088235273, + "language_loss": 0.8176682, + "learning_rate": 1.165564305813915e-06, + "loss": 0.83946395, + "num_input_tokens_seen": 115949025, + "step": 5388, + "time_per_iteration": 2.4786365032196045 + }, + { + "auxiliary_loss_clip": 0.01156247, + "auxiliary_loss_mlp": 0.01024858, + "balance_loss_clip": 1.04869533, + "balance_loss_mlp": 1.01760387, + "epoch": 0.6479889376540612, + "flos": 20083581648000.0, + "grad_norm": 1.7424551227277802, + "language_loss": 0.81379735, + "learning_rate": 1.1648564346212019e-06, + "loss": 0.83560842, + "num_input_tokens_seen": 115968145, + "step": 5389, + "time_per_iteration": 2.4599738121032715 + }, + { + "auxiliary_loss_clip": 0.01152876, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.04916835, + "balance_loss_mlp": 1.02048755, + "epoch": 0.6481091805447003, + "flos": 26758082039040.0, + "grad_norm": 2.0962209175181616, + "language_loss": 0.76356912, + "learning_rate": 1.164148690119104e-06, + "loss": 0.78537464, + "num_input_tokens_seen": 115989425, + "step": 5390, + "time_per_iteration": 2.4863195419311523 + }, + { + "auxiliary_loss_clip": 0.01167658, + "auxiliary_loss_mlp": 0.01025513, + "balance_loss_clip": 1.04841769, + "balance_loss_mlp": 1.01823568, + "epoch": 0.6482294234353394, + "flos": 23952094462080.0, + "grad_norm": 2.0303993663255695, + "language_loss": 0.74334908, + "learning_rate": 1.163441072414985e-06, + "loss": 0.76528084, + "num_input_tokens_seen": 116009630, + "step": 5391, + "time_per_iteration": 2.442458391189575 + }, + { + "auxiliary_loss_clip": 0.01156777, + "auxiliary_loss_mlp": 0.01024132, + "balance_loss_clip": 1.04952657, + "balance_loss_mlp": 1.01689279, + "epoch": 0.6483496663259785, + "flos": 26209833776640.0, + "grad_norm": 1.984044679603604, + "language_loss": 0.69866383, + "learning_rate": 1.16273358161619e-06, + "loss": 0.72047293, + "num_input_tokens_seen": 116029965, + "step": 5392, + "time_per_iteration": 2.493894577026367 + }, + { + "auxiliary_loss_clip": 0.0115253, + "auxiliary_loss_mlp": 0.01026518, + "balance_loss_clip": 1.04963517, + "balance_loss_mlp": 1.01916242, + "epoch": 0.6484699092166175, + "flos": 20922239370240.0, + "grad_norm": 1.934568824094898, + "language_loss": 0.83545816, + "learning_rate": 1.1620262178300446e-06, + "loss": 0.8572486, + "num_input_tokens_seen": 116048580, + "step": 5393, + "time_per_iteration": 2.4673571586608887 + }, + { + "auxiliary_loss_clip": 0.0112838, + "auxiliary_loss_mlp": 0.01024479, + "balance_loss_clip": 1.04354692, + "balance_loss_mlp": 1.01718903, + "epoch": 0.6485901521072567, + "flos": 33072865678080.0, + "grad_norm": 2.366200624095203, + "language_loss": 0.75639564, + "learning_rate": 1.1613189811638563e-06, + "loss": 0.77792418, + "num_input_tokens_seen": 116070305, + "step": 5394, + "time_per_iteration": 2.6324853897094727 + }, + { + "auxiliary_loss_clip": 0.01158177, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.04957438, + "balance_loss_mlp": 1.02020764, + "epoch": 0.6487103949978957, + "flos": 22274060745600.0, + "grad_norm": 1.5125506975357599, + "language_loss": 0.77831137, + "learning_rate": 1.1606118717249117e-06, + "loss": 0.80016541, + "num_input_tokens_seen": 116090405, + "step": 5395, + "time_per_iteration": 2.4912984371185303 + }, + { + "auxiliary_loss_clip": 0.01173863, + "auxiliary_loss_mlp": 0.01024893, + "balance_loss_clip": 1.04964268, + "balance_loss_mlp": 1.0173229, + "epoch": 0.6488306378885348, + "flos": 22930400010240.0, + "grad_norm": 1.8538992817476412, + "language_loss": 0.67611563, + "learning_rate": 1.1599048896204787e-06, + "loss": 0.69810319, + "num_input_tokens_seen": 116110285, + "step": 5396, + "time_per_iteration": 2.4224696159362793 + }, + { + "auxiliary_loss_clip": 0.0113375, + "auxiliary_loss_mlp": 0.01026279, + "balance_loss_clip": 1.0474118, + "balance_loss_mlp": 1.01890576, + "epoch": 0.648950880779174, + "flos": 20376110010240.0, + "grad_norm": 1.86818198474614, + "language_loss": 0.80856854, + "learning_rate": 1.1591980349578061e-06, + "loss": 0.83016884, + "num_input_tokens_seen": 116128955, + "step": 5397, + "time_per_iteration": 2.5147154331207275 + }, + { + "auxiliary_loss_clip": 0.01038576, + "auxiliary_loss_mlp": 0.01001414, + "balance_loss_clip": 1.01368618, + "balance_loss_mlp": 1.00051403, + "epoch": 0.649071123669813, + "flos": 59930889310080.0, + "grad_norm": 0.7370653548124485, + "language_loss": 0.54303443, + "learning_rate": 1.158491307844123e-06, + "loss": 0.5634343, + "num_input_tokens_seen": 116188875, + "step": 5398, + "time_per_iteration": 3.062973976135254 + }, + { + "auxiliary_loss_clip": 0.01143135, + "auxiliary_loss_mlp": 0.01024168, + "balance_loss_clip": 1.04818606, + "balance_loss_mlp": 1.0168488, + "epoch": 0.6491913665604521, + "flos": 20446566537600.0, + "grad_norm": 1.7038105458207862, + "language_loss": 0.83832473, + "learning_rate": 1.1577847083866387e-06, + "loss": 0.85999775, + "num_input_tokens_seen": 116207910, + "step": 5399, + "time_per_iteration": 2.464677333831787 + }, + { + "auxiliary_loss_clip": 0.01132292, + "auxiliary_loss_mlp": 0.01026611, + "balance_loss_clip": 1.04479098, + "balance_loss_mlp": 1.0187726, + "epoch": 0.6493116094510912, + "flos": 16946820702720.0, + "grad_norm": 1.855177009785709, + "language_loss": 0.71696377, + "learning_rate": 1.1570782366925453e-06, + "loss": 0.73855281, + "num_input_tokens_seen": 116226425, + "step": 5400, + "time_per_iteration": 2.452134847640991 + }, + { + "auxiliary_loss_clip": 0.01142262, + "auxiliary_loss_mlp": 0.01023291, + "balance_loss_clip": 1.04279935, + "balance_loss_mlp": 1.01570308, + "epoch": 0.6494318523417303, + "flos": 18802935072000.0, + "grad_norm": 1.8018560466709699, + "language_loss": 0.75824016, + "learning_rate": 1.1563718928690132e-06, + "loss": 0.77989572, + "num_input_tokens_seen": 116243860, + "step": 5401, + "time_per_iteration": 2.458969831466675 + }, + { + "auxiliary_loss_clip": 0.01125747, + "auxiliary_loss_mlp": 0.01027501, + "balance_loss_clip": 1.04444623, + "balance_loss_mlp": 1.0197736, + "epoch": 0.6495520952323693, + "flos": 18982847318400.0, + "grad_norm": 2.1628364599989522, + "language_loss": 0.716272, + "learning_rate": 1.1556656770231942e-06, + "loss": 0.73780447, + "num_input_tokens_seen": 116260055, + "step": 5402, + "time_per_iteration": 3.3266797065734863 + }, + { + "auxiliary_loss_clip": 0.01158118, + "auxiliary_loss_mlp": 0.01027369, + "balance_loss_clip": 1.04811358, + "balance_loss_mlp": 1.02047014, + "epoch": 0.6496723381230085, + "flos": 22745388032640.0, + "grad_norm": 1.6160907475737125, + "language_loss": 0.76413393, + "learning_rate": 1.1549595892622207e-06, + "loss": 0.78598881, + "num_input_tokens_seen": 116278825, + "step": 5403, + "time_per_iteration": 2.4608218669891357 + }, + { + "auxiliary_loss_clip": 0.01023799, + "auxiliary_loss_mlp": 0.01003853, + "balance_loss_clip": 1.01659656, + "balance_loss_mlp": 1.00304842, + "epoch": 0.6497925810136476, + "flos": 62145283887360.0, + "grad_norm": 0.8366120045218466, + "language_loss": 0.59027404, + "learning_rate": 1.1542536296932047e-06, + "loss": 0.61055058, + "num_input_tokens_seen": 116342360, + "step": 5404, + "time_per_iteration": 3.0789506435394287 + }, + { + "auxiliary_loss_clip": 0.01133697, + "auxiliary_loss_mlp": 0.01026499, + "balance_loss_clip": 1.04367888, + "balance_loss_mlp": 1.01855385, + "epoch": 0.6499128239042866, + "flos": 20156731695360.0, + "grad_norm": 1.5949901860122606, + "language_loss": 0.69981307, + "learning_rate": 1.1535477984232414e-06, + "loss": 0.72141504, + "num_input_tokens_seen": 116362235, + "step": 5405, + "time_per_iteration": 2.505502223968506 + }, + { + "auxiliary_loss_clip": 0.01116274, + "auxiliary_loss_mlp": 0.01027173, + "balance_loss_clip": 1.03943741, + "balance_loss_mlp": 1.01974607, + "epoch": 0.6500330667949258, + "flos": 24462420940800.0, + "grad_norm": 1.741601822144197, + "language_loss": 0.77370203, + "learning_rate": 1.152842095559404e-06, + "loss": 0.79513645, + "num_input_tokens_seen": 116382895, + "step": 5406, + "time_per_iteration": 4.2207725048065186 + }, + { + "auxiliary_loss_clip": 0.01146971, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.04587376, + "balance_loss_mlp": 1.0186286, + "epoch": 0.6501533096855648, + "flos": 25477399549440.0, + "grad_norm": 1.6914510232454572, + "language_loss": 0.76452059, + "learning_rate": 1.1521365212087474e-06, + "loss": 0.78624749, + "num_input_tokens_seen": 116402880, + "step": 5407, + "time_per_iteration": 2.523043632507324 + }, + { + "auxiliary_loss_clip": 0.01155994, + "auxiliary_loss_mlp": 0.01024231, + "balance_loss_clip": 1.04617023, + "balance_loss_mlp": 1.0165782, + "epoch": 0.6502735525762039, + "flos": 44819245347840.0, + "grad_norm": 1.5923291623948574, + "language_loss": 0.70774889, + "learning_rate": 1.1514310754783062e-06, + "loss": 0.7295512, + "num_input_tokens_seen": 116425830, + "step": 5408, + "time_per_iteration": 3.4084830284118652 + }, + { + "auxiliary_loss_clip": 0.01145397, + "auxiliary_loss_mlp": 0.01023532, + "balance_loss_clip": 1.04765832, + "balance_loss_mlp": 1.0156939, + "epoch": 0.6503937954668431, + "flos": 28658546726400.0, + "grad_norm": 1.8707926288294439, + "language_loss": 0.73297626, + "learning_rate": 1.1507257584750964e-06, + "loss": 0.75466549, + "num_input_tokens_seen": 116446010, + "step": 5409, + "time_per_iteration": 2.589836597442627 + }, + { + "auxiliary_loss_clip": 0.01171445, + "auxiliary_loss_mlp": 0.01025641, + "balance_loss_clip": 1.04959369, + "balance_loss_mlp": 1.01791, + "epoch": 0.6505140383574821, + "flos": 20922562592640.0, + "grad_norm": 1.9796961251777545, + "language_loss": 0.77842784, + "learning_rate": 1.150020570306113e-06, + "loss": 0.80039865, + "num_input_tokens_seen": 116465150, + "step": 5410, + "time_per_iteration": 2.410038471221924 + }, + { + "auxiliary_loss_clip": 0.01135092, + "auxiliary_loss_mlp": 0.0102527, + "balance_loss_clip": 1.04149389, + "balance_loss_mlp": 1.01749802, + "epoch": 0.6506342812481212, + "flos": 20595236929920.0, + "grad_norm": 2.6853216891952445, + "language_loss": 0.74958539, + "learning_rate": 1.1493155110783338e-06, + "loss": 0.77118903, + "num_input_tokens_seen": 116483675, + "step": 5411, + "time_per_iteration": 2.4797093868255615 + }, + { + "auxiliary_loss_clip": 0.01155657, + "auxiliary_loss_mlp": 0.01024005, + "balance_loss_clip": 1.04776502, + "balance_loss_mlp": 1.01623249, + "epoch": 0.6507545241387603, + "flos": 30226478279040.0, + "grad_norm": 2.6621805251068626, + "language_loss": 0.70573229, + "learning_rate": 1.1486105808987155e-06, + "loss": 0.72752893, + "num_input_tokens_seen": 116505165, + "step": 5412, + "time_per_iteration": 2.545302629470825 + }, + { + "auxiliary_loss_clip": 0.01158894, + "auxiliary_loss_mlp": 0.0102059, + "balance_loss_clip": 1.04960871, + "balance_loss_mlp": 1.01298499, + "epoch": 0.6508747670293994, + "flos": 17128241320320.0, + "grad_norm": 1.825299864888889, + "language_loss": 0.81571674, + "learning_rate": 1.1479057798741947e-06, + "loss": 0.8375116, + "num_input_tokens_seen": 116523220, + "step": 5413, + "time_per_iteration": 2.437554121017456 + }, + { + "auxiliary_loss_clip": 0.01054643, + "auxiliary_loss_mlp": 0.01009326, + "balance_loss_clip": 1.02318025, + "balance_loss_mlp": 1.00810421, + "epoch": 0.6509950099200384, + "flos": 68559826573440.0, + "grad_norm": 0.7909187214930529, + "language_loss": 0.53340364, + "learning_rate": 1.14720110811169e-06, + "loss": 0.55404335, + "num_input_tokens_seen": 116580450, + "step": 5414, + "time_per_iteration": 3.0831449031829834 + }, + { + "auxiliary_loss_clip": 0.01161779, + "auxiliary_loss_mlp": 0.01027469, + "balance_loss_clip": 1.049137, + "balance_loss_mlp": 1.01983345, + "epoch": 0.6511152528106776, + "flos": 22347462188160.0, + "grad_norm": 2.1337486389609532, + "language_loss": 0.76643193, + "learning_rate": 1.146496565718098e-06, + "loss": 0.78832436, + "num_input_tokens_seen": 116601020, + "step": 5415, + "time_per_iteration": 2.465686559677124 + }, + { + "auxiliary_loss_clip": 0.01147364, + "auxiliary_loss_mlp": 0.01026344, + "balance_loss_clip": 1.05141306, + "balance_loss_mlp": 1.01833868, + "epoch": 0.6512354957013167, + "flos": 20522158709760.0, + "grad_norm": 1.9173268352475563, + "language_loss": 0.75976503, + "learning_rate": 1.1457921528002996e-06, + "loss": 0.78150207, + "num_input_tokens_seen": 116619455, + "step": 5416, + "time_per_iteration": 2.4744672775268555 + }, + { + "auxiliary_loss_clip": 0.01171561, + "auxiliary_loss_mlp": 0.00762552, + "balance_loss_clip": 1.05003142, + "balance_loss_mlp": 1.00070596, + "epoch": 0.6513557385919557, + "flos": 32337342881280.0, + "grad_norm": 2.2128519736824797, + "language_loss": 0.72480196, + "learning_rate": 1.1450878694651522e-06, + "loss": 0.74414313, + "num_input_tokens_seen": 116640020, + "step": 5417, + "time_per_iteration": 2.5107059478759766 + }, + { + "auxiliary_loss_clip": 0.01114932, + "auxiliary_loss_mlp": 0.01023737, + "balance_loss_clip": 1.04161572, + "balance_loss_mlp": 1.0161196, + "epoch": 0.6514759814825949, + "flos": 12093206417280.0, + "grad_norm": 7.295446121662826, + "language_loss": 0.62675995, + "learning_rate": 1.1443837158194954e-06, + "loss": 0.64814663, + "num_input_tokens_seen": 116655165, + "step": 5418, + "time_per_iteration": 2.500206708908081 + }, + { + "auxiliary_loss_clip": 0.01133119, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.05281436, + "balance_loss_mlp": 1.02090216, + "epoch": 0.651596224373234, + "flos": 22526907557760.0, + "grad_norm": 1.880254572797962, + "language_loss": 0.74593651, + "learning_rate": 1.1436796919701484e-06, + "loss": 0.76755726, + "num_input_tokens_seen": 116673880, + "step": 5419, + "time_per_iteration": 2.518559694290161 + }, + { + "auxiliary_loss_clip": 0.01145284, + "auxiliary_loss_mlp": 0.01023396, + "balance_loss_clip": 1.05036473, + "balance_loss_mlp": 1.01603508, + "epoch": 0.651716467263873, + "flos": 27818955250560.0, + "grad_norm": 1.9222445285583931, + "language_loss": 0.61628288, + "learning_rate": 1.1429757980239115e-06, + "loss": 0.63796973, + "num_input_tokens_seen": 116694305, + "step": 5420, + "time_per_iteration": 2.5443460941314697 + }, + { + "auxiliary_loss_clip": 0.01171686, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.049119, + "balance_loss_mlp": 1.02485895, + "epoch": 0.6518367101545122, + "flos": 24316300414080.0, + "grad_norm": 2.2601579156734855, + "language_loss": 0.81650531, + "learning_rate": 1.1422720340875636e-06, + "loss": 0.838552, + "num_input_tokens_seen": 116713055, + "step": 5421, + "time_per_iteration": 2.431621789932251 + }, + { + "auxiliary_loss_clip": 0.01162808, + "auxiliary_loss_mlp": 0.01026526, + "balance_loss_clip": 1.04787552, + "balance_loss_mlp": 1.01940084, + "epoch": 0.6519569530451512, + "flos": 20011939971840.0, + "grad_norm": 1.9156298586850238, + "language_loss": 0.79172671, + "learning_rate": 1.1415684002678671e-06, + "loss": 0.81362003, + "num_input_tokens_seen": 116731815, + "step": 5422, + "time_per_iteration": 2.43428373336792 + }, + { + "auxiliary_loss_clip": 0.01146, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.04561782, + "balance_loss_mlp": 1.0227654, + "epoch": 0.6520771959357903, + "flos": 21576064682880.0, + "grad_norm": 2.4552864604426814, + "language_loss": 0.77722645, + "learning_rate": 1.1408648966715617e-06, + "loss": 0.79899329, + "num_input_tokens_seen": 116749335, + "step": 5423, + "time_per_iteration": 2.48447585105896 + }, + { + "auxiliary_loss_clip": 0.01142831, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.04315734, + "balance_loss_mlp": 1.02101958, + "epoch": 0.6521974388264293, + "flos": 22711021695360.0, + "grad_norm": 1.7686189723053143, + "language_loss": 0.72463131, + "learning_rate": 1.1401615234053683e-06, + "loss": 0.74634498, + "num_input_tokens_seen": 116768155, + "step": 5424, + "time_per_iteration": 2.475473403930664 + }, + { + "auxiliary_loss_clip": 0.01144117, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.04617238, + "balance_loss_mlp": 1.02150536, + "epoch": 0.6523176817170685, + "flos": 23002939526400.0, + "grad_norm": 2.013088659868647, + "language_loss": 0.75732207, + "learning_rate": 1.1394582805759885e-06, + "loss": 0.77905393, + "num_input_tokens_seen": 116787435, + "step": 5425, + "time_per_iteration": 2.492194652557373 + }, + { + "auxiliary_loss_clip": 0.01158779, + "auxiliary_loss_mlp": 0.01029036, + "balance_loss_clip": 1.04983258, + "balance_loss_mlp": 1.02158237, + "epoch": 0.6524379246077076, + "flos": 21688249835520.0, + "grad_norm": 1.6860771855964514, + "language_loss": 0.75490665, + "learning_rate": 1.1387551682901022e-06, + "loss": 0.77678478, + "num_input_tokens_seen": 116808040, + "step": 5426, + "time_per_iteration": 2.4582362174987793 + }, + { + "auxiliary_loss_clip": 0.01126798, + "auxiliary_loss_mlp": 0.01025834, + "balance_loss_clip": 1.04531419, + "balance_loss_mlp": 1.01857066, + "epoch": 0.6525581674983466, + "flos": 19390936711680.0, + "grad_norm": 3.468677449449453, + "language_loss": 0.70685619, + "learning_rate": 1.138052186654373e-06, + "loss": 0.72838247, + "num_input_tokens_seen": 116825510, + "step": 5427, + "time_per_iteration": 2.504934310913086 + }, + { + "auxiliary_loss_clip": 0.01146192, + "auxiliary_loss_mlp": 0.01023189, + "balance_loss_clip": 1.04741454, + "balance_loss_mlp": 1.01493907, + "epoch": 0.6526784103889858, + "flos": 17165444832000.0, + "grad_norm": 2.2006353470320477, + "language_loss": 0.88082296, + "learning_rate": 1.1373493357754417e-06, + "loss": 0.90251678, + "num_input_tokens_seen": 116844415, + "step": 5428, + "time_per_iteration": 2.456697940826416 + }, + { + "auxiliary_loss_clip": 0.01169934, + "auxiliary_loss_mlp": 0.01021271, + "balance_loss_clip": 1.04770613, + "balance_loss_mlp": 1.01436329, + "epoch": 0.6527986532796248, + "flos": 18989168112000.0, + "grad_norm": 1.7605706431425618, + "language_loss": 0.77212018, + "learning_rate": 1.1366466157599303e-06, + "loss": 0.79403222, + "num_input_tokens_seen": 116863690, + "step": 5429, + "time_per_iteration": 3.1312551498413086 + }, + { + "auxiliary_loss_clip": 0.01111853, + "auxiliary_loss_mlp": 0.00762844, + "balance_loss_clip": 1.04262304, + "balance_loss_mlp": 1.00061369, + "epoch": 0.6529188961702639, + "flos": 14238581011200.0, + "grad_norm": 2.2214907247739797, + "language_loss": 0.76363444, + "learning_rate": 1.1359440267144412e-06, + "loss": 0.78238142, + "num_input_tokens_seen": 116881145, + "step": 5430, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.01159768, + "auxiliary_loss_mlp": 0.01022424, + "balance_loss_clip": 1.04864776, + "balance_loss_mlp": 1.0154984, + "epoch": 0.653039139060903, + "flos": 36682929158400.0, + "grad_norm": 1.9871571570191675, + "language_loss": 0.74024165, + "learning_rate": 1.1352415687455556e-06, + "loss": 0.76206356, + "num_input_tokens_seen": 116902405, + "step": 5431, + "time_per_iteration": 2.5878005027770996 + }, + { + "auxiliary_loss_clip": 0.01158132, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.04976785, + "balance_loss_mlp": 1.02188492, + "epoch": 0.6531593819515421, + "flos": 25376275785600.0, + "grad_norm": 2.3825868980395932, + "language_loss": 0.63941169, + "learning_rate": 1.1345392419598362e-06, + "loss": 0.66128504, + "num_input_tokens_seen": 116921285, + "step": 5432, + "time_per_iteration": 3.373856782913208 + }, + { + "auxiliary_loss_clip": 0.01151061, + "auxiliary_loss_mlp": 0.01020679, + "balance_loss_clip": 1.04601049, + "balance_loss_mlp": 1.01269817, + "epoch": 0.6532796248421812, + "flos": 21178533888000.0, + "grad_norm": 1.6477537244735991, + "language_loss": 0.71806282, + "learning_rate": 1.1338370464638263e-06, + "loss": 0.73978025, + "num_input_tokens_seen": 116940685, + "step": 5433, + "time_per_iteration": 3.294677495956421 + }, + { + "auxiliary_loss_clip": 0.01169666, + "auxiliary_loss_mlp": 0.01020712, + "balance_loss_clip": 1.04741144, + "balance_loss_mlp": 1.01335645, + "epoch": 0.6533998677328203, + "flos": 17675950878720.0, + "grad_norm": 2.336478210524243, + "language_loss": 0.64025503, + "learning_rate": 1.1331349823640474e-06, + "loss": 0.66215879, + "num_input_tokens_seen": 116958115, + "step": 5434, + "time_per_iteration": 2.3974545001983643 + }, + { + "auxiliary_loss_clip": 0.01157395, + "auxiliary_loss_mlp": 0.00761877, + "balance_loss_clip": 1.04690075, + "balance_loss_mlp": 1.00064099, + "epoch": 0.6535201106234594, + "flos": 28400384701440.0, + "grad_norm": 2.3189662681660304, + "language_loss": 0.78374422, + "learning_rate": 1.132433049767003e-06, + "loss": 0.80293697, + "num_input_tokens_seen": 116976030, + "step": 5435, + "time_per_iteration": 3.2574713230133057 + }, + { + "auxiliary_loss_clip": 0.01141604, + "auxiliary_loss_mlp": 0.01025959, + "balance_loss_clip": 1.04689407, + "balance_loss_mlp": 1.01935434, + "epoch": 0.6536403535140984, + "flos": 23586667447680.0, + "grad_norm": 1.5918954219305315, + "language_loss": 0.80939412, + "learning_rate": 1.1317312487791748e-06, + "loss": 0.83106983, + "num_input_tokens_seen": 116997680, + "step": 5436, + "time_per_iteration": 2.504192352294922 + }, + { + "auxiliary_loss_clip": 0.01151214, + "auxiliary_loss_mlp": 0.01027757, + "balance_loss_clip": 1.04646468, + "balance_loss_mlp": 1.02033031, + "epoch": 0.6537605964047376, + "flos": 21579476474880.0, + "grad_norm": 3.93432374522333, + "language_loss": 0.73504812, + "learning_rate": 1.1310295795070253e-06, + "loss": 0.75683784, + "num_input_tokens_seen": 117017620, + "step": 5437, + "time_per_iteration": 2.4524824619293213 + }, + { + "auxiliary_loss_clip": 0.01119438, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.04329741, + "balance_loss_mlp": 1.02091265, + "epoch": 0.6538808392953767, + "flos": 26833997433600.0, + "grad_norm": 1.8605797673593854, + "language_loss": 0.80882508, + "learning_rate": 1.1303280420569982e-06, + "loss": 0.83030462, + "num_input_tokens_seen": 117039505, + "step": 5438, + "time_per_iteration": 2.5812337398529053 + }, + { + "auxiliary_loss_clip": 0.01152315, + "auxiliary_loss_mlp": 0.01023282, + "balance_loss_clip": 1.04673338, + "balance_loss_mlp": 1.01579022, + "epoch": 0.6540010821860157, + "flos": 30738241301760.0, + "grad_norm": 1.6635113604138412, + "language_loss": 0.77096975, + "learning_rate": 1.1296266365355158e-06, + "loss": 0.79272574, + "num_input_tokens_seen": 117062890, + "step": 5439, + "time_per_iteration": 2.53440260887146 + }, + { + "auxiliary_loss_clip": 0.01131821, + "auxiliary_loss_mlp": 0.01022346, + "balance_loss_clip": 1.0462091, + "balance_loss_mlp": 1.01437128, + "epoch": 0.6541213250766549, + "flos": 26907147480960.0, + "grad_norm": 1.8552373161488265, + "language_loss": 0.73964345, + "learning_rate": 1.1289253630489806e-06, + "loss": 0.76118511, + "num_input_tokens_seen": 117083940, + "step": 5440, + "time_per_iteration": 2.552569627761841 + }, + { + "auxiliary_loss_clip": 0.01161062, + "auxiliary_loss_mlp": 0.01029219, + "balance_loss_clip": 1.04713321, + "balance_loss_mlp": 1.02129793, + "epoch": 0.6542415679672939, + "flos": 19172384409600.0, + "grad_norm": 2.275298212771805, + "language_loss": 0.72369605, + "learning_rate": 1.1282242217037753e-06, + "loss": 0.74559891, + "num_input_tokens_seen": 117101440, + "step": 5441, + "time_per_iteration": 2.428621530532837 + }, + { + "auxiliary_loss_clip": 0.01110524, + "auxiliary_loss_mlp": 0.01024917, + "balance_loss_clip": 1.03989649, + "balance_loss_mlp": 1.01717472, + "epoch": 0.654361810857933, + "flos": 48173517100800.0, + "grad_norm": 3.4533814683365014, + "language_loss": 0.61571622, + "learning_rate": 1.127523212606262e-06, + "loss": 0.63707066, + "num_input_tokens_seen": 117124265, + "step": 5442, + "time_per_iteration": 2.7673609256744385 + }, + { + "auxiliary_loss_clip": 0.01156203, + "auxiliary_loss_mlp": 0.01026603, + "balance_loss_clip": 1.04887748, + "balance_loss_mlp": 1.01908684, + "epoch": 0.6544820537485722, + "flos": 26943165843840.0, + "grad_norm": 1.5330886653022804, + "language_loss": 0.72986728, + "learning_rate": 1.1268223358627835e-06, + "loss": 0.75169533, + "num_input_tokens_seen": 117146755, + "step": 5443, + "time_per_iteration": 2.502192974090576 + }, + { + "auxiliary_loss_clip": 0.01170755, + "auxiliary_loss_mlp": 0.01025362, + "balance_loss_clip": 1.04827738, + "balance_loss_mlp": 1.01768541, + "epoch": 0.6546022966392112, + "flos": 20886328748160.0, + "grad_norm": 1.7444612670351287, + "language_loss": 0.72069454, + "learning_rate": 1.126121591579663e-06, + "loss": 0.74265575, + "num_input_tokens_seen": 117165960, + "step": 5444, + "time_per_iteration": 2.4083096981048584 + }, + { + "auxiliary_loss_clip": 0.01153383, + "auxiliary_loss_mlp": 0.01026073, + "balance_loss_clip": 1.04902899, + "balance_loss_mlp": 1.01859808, + "epoch": 0.6547225395298503, + "flos": 24936693143040.0, + "grad_norm": 1.52777908215882, + "language_loss": 0.68855822, + "learning_rate": 1.1254209798632018e-06, + "loss": 0.71035278, + "num_input_tokens_seen": 117186980, + "step": 5445, + "time_per_iteration": 2.481923818588257 + }, + { + "auxiliary_loss_clip": 0.01088637, + "auxiliary_loss_mlp": 0.01022439, + "balance_loss_clip": 1.0399189, + "balance_loss_mlp": 1.01489317, + "epoch": 0.6548427824204894, + "flos": 22565942663040.0, + "grad_norm": 1.5885633187903272, + "language_loss": 0.84326208, + "learning_rate": 1.124720500819683e-06, + "loss": 0.86437285, + "num_input_tokens_seen": 117205135, + "step": 5446, + "time_per_iteration": 2.578939199447632 + }, + { + "auxiliary_loss_clip": 0.01175364, + "auxiliary_loss_mlp": 0.01029573, + "balance_loss_clip": 1.05280805, + "balance_loss_mlp": 1.02147877, + "epoch": 0.6549630253111285, + "flos": 18442500048000.0, + "grad_norm": 1.8464784197272823, + "language_loss": 0.82653767, + "learning_rate": 1.1240201545553682e-06, + "loss": 0.84858704, + "num_input_tokens_seen": 117222935, + "step": 5447, + "time_per_iteration": 2.41939640045166 + }, + { + "auxiliary_loss_clip": 0.01127309, + "auxiliary_loss_mlp": 0.01024715, + "balance_loss_clip": 1.04546309, + "balance_loss_mlp": 1.01740122, + "epoch": 0.6550832682017675, + "flos": 25187313312000.0, + "grad_norm": 1.7592498599445936, + "language_loss": 0.73017538, + "learning_rate": 1.1233199411764987e-06, + "loss": 0.75169563, + "num_input_tokens_seen": 117242370, + "step": 5448, + "time_per_iteration": 2.546719789505005 + }, + { + "auxiliary_loss_clip": 0.01116508, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.0417757, + "balance_loss_mlp": 1.01867139, + "epoch": 0.6552035110924067, + "flos": 22748153379840.0, + "grad_norm": 1.7906602316990663, + "language_loss": 0.68703872, + "learning_rate": 1.1226198607892978e-06, + "loss": 0.70846474, + "num_input_tokens_seen": 117262930, + "step": 5449, + "time_per_iteration": 2.534228801727295 + }, + { + "auxiliary_loss_clip": 0.01120039, + "auxiliary_loss_mlp": 0.01025884, + "balance_loss_clip": 1.04696107, + "balance_loss_mlp": 1.01824594, + "epoch": 0.6553237539830458, + "flos": 21799178012160.0, + "grad_norm": 1.7359555512365068, + "language_loss": 0.79595459, + "learning_rate": 1.1219199134999664e-06, + "loss": 0.81741381, + "num_input_tokens_seen": 117281430, + "step": 5450, + "time_per_iteration": 2.5475001335144043 + }, + { + "auxiliary_loss_clip": 0.01145622, + "auxiliary_loss_mlp": 0.0102828, + "balance_loss_clip": 1.04718912, + "balance_loss_mlp": 1.01979852, + "epoch": 0.6554439968736848, + "flos": 20887226588160.0, + "grad_norm": 2.2396766975448985, + "language_loss": 0.78496528, + "learning_rate": 1.1212200994146863e-06, + "loss": 0.80670428, + "num_input_tokens_seen": 117299185, + "step": 5451, + "time_per_iteration": 2.4714713096618652 + }, + { + "auxiliary_loss_clip": 0.01125096, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.04054236, + "balance_loss_mlp": 1.02187109, + "epoch": 0.655564239764324, + "flos": 16139045698560.0, + "grad_norm": 2.603008830866016, + "language_loss": 0.75873518, + "learning_rate": 1.120520418639618e-06, + "loss": 0.78028047, + "num_input_tokens_seen": 117317720, + "step": 5452, + "time_per_iteration": 2.5121986865997314 + }, + { + "auxiliary_loss_clip": 0.01157432, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.05039299, + "balance_loss_mlp": 1.02167535, + "epoch": 0.655684482654963, + "flos": 29570354496000.0, + "grad_norm": 1.9105713941414475, + "language_loss": 0.83696365, + "learning_rate": 1.119820871280903e-06, + "loss": 0.85882533, + "num_input_tokens_seen": 117338795, + "step": 5453, + "time_per_iteration": 2.527688503265381 + }, + { + "auxiliary_loss_clip": 0.01155859, + "auxiliary_loss_mlp": 0.01024572, + "balance_loss_clip": 1.04797316, + "balance_loss_mlp": 1.01706171, + "epoch": 0.6558047255456021, + "flos": 29789409588480.0, + "grad_norm": 4.816395132912529, + "language_loss": 0.73438352, + "learning_rate": 1.1191214574446614e-06, + "loss": 0.75618786, + "num_input_tokens_seen": 117359040, + "step": 5454, + "time_per_iteration": 2.5190632343292236 + }, + { + "auxiliary_loss_clip": 0.01137683, + "auxiliary_loss_mlp": 0.01026661, + "balance_loss_clip": 1.04529715, + "balance_loss_mlp": 1.01910353, + "epoch": 0.6559249684362413, + "flos": 29059166090880.0, + "grad_norm": 2.3987333741800696, + "language_loss": 0.79803085, + "learning_rate": 1.118422177236995e-06, + "loss": 0.81967425, + "num_input_tokens_seen": 117380865, + "step": 5455, + "time_per_iteration": 2.5466363430023193 + }, + { + "auxiliary_loss_clip": 0.0114459, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.04652262, + "balance_loss_mlp": 1.02335334, + "epoch": 0.6560452113268803, + "flos": 20225464369920.0, + "grad_norm": 1.943398448947495, + "language_loss": 0.85624105, + "learning_rate": 1.1177230307639835e-06, + "loss": 0.87800193, + "num_input_tokens_seen": 117398405, + "step": 5456, + "time_per_iteration": 3.2690234184265137 + }, + { + "auxiliary_loss_clip": 0.01124994, + "auxiliary_loss_mlp": 0.01025436, + "balance_loss_clip": 1.04343605, + "balance_loss_mlp": 1.01791942, + "epoch": 0.6561654542175194, + "flos": 25045538330880.0, + "grad_norm": 1.663899665356716, + "language_loss": 0.7848624, + "learning_rate": 1.1170240181316865e-06, + "loss": 0.80636674, + "num_input_tokens_seen": 117419850, + "step": 5457, + "time_per_iteration": 2.5490152835845947 + }, + { + "auxiliary_loss_clip": 0.01124641, + "auxiliary_loss_mlp": 0.01028031, + "balance_loss_clip": 1.04216897, + "balance_loss_mlp": 1.0201993, + "epoch": 0.6562856971081584, + "flos": 22856711258880.0, + "grad_norm": 2.726734622191998, + "language_loss": 0.79474413, + "learning_rate": 1.1163251394461442e-06, + "loss": 0.81627089, + "num_input_tokens_seen": 117438330, + "step": 5458, + "time_per_iteration": 2.506521701812744 + }, + { + "auxiliary_loss_clip": 0.01154774, + "auxiliary_loss_mlp": 0.01026847, + "balance_loss_clip": 1.04819155, + "balance_loss_mlp": 1.01946175, + "epoch": 0.6564059399987976, + "flos": 18872565586560.0, + "grad_norm": 1.91155008434475, + "language_loss": 0.82672256, + "learning_rate": 1.1156263948133746e-06, + "loss": 0.84853876, + "num_input_tokens_seen": 117454985, + "step": 5459, + "time_per_iteration": 3.284507989883423 + }, + { + "auxiliary_loss_clip": 0.01108252, + "auxiliary_loss_mlp": 0.00762642, + "balance_loss_clip": 1.04414308, + "balance_loss_mlp": 1.00072885, + "epoch": 0.6565261828894366, + "flos": 25484187219840.0, + "grad_norm": 2.238904793705515, + "language_loss": 0.77235126, + "learning_rate": 1.1149277843393787e-06, + "loss": 0.79106021, + "num_input_tokens_seen": 117476145, + "step": 5460, + "time_per_iteration": 3.3520314693450928 + }, + { + "auxiliary_loss_clip": 0.01095583, + "auxiliary_loss_mlp": 0.00762707, + "balance_loss_clip": 1.0371139, + "balance_loss_mlp": 1.00071025, + "epoch": 0.6566464257800757, + "flos": 19683500987520.0, + "grad_norm": 2.47829113441089, + "language_loss": 0.63581449, + "learning_rate": 1.1142293081301342e-06, + "loss": 0.65439737, + "num_input_tokens_seen": 117494025, + "step": 5461, + "time_per_iteration": 2.589726686477661 + }, + { + "auxiliary_loss_clip": 0.01138806, + "auxiliary_loss_mlp": 0.01020857, + "balance_loss_clip": 1.04576254, + "balance_loss_mlp": 1.01400793, + "epoch": 0.6567666686707149, + "flos": 23514127931520.0, + "grad_norm": 1.7557402581691772, + "language_loss": 0.68078631, + "learning_rate": 1.1135309662915995e-06, + "loss": 0.70238292, + "num_input_tokens_seen": 117514190, + "step": 5462, + "time_per_iteration": 3.3392550945281982 + }, + { + "auxiliary_loss_clip": 0.01121384, + "auxiliary_loss_mlp": 0.01025404, + "balance_loss_clip": 1.04316413, + "balance_loss_mlp": 1.01804876, + "epoch": 0.6568869115613539, + "flos": 32781342896640.0, + "grad_norm": 2.0053983555877184, + "language_loss": 0.60525131, + "learning_rate": 1.112832758929712e-06, + "loss": 0.62671912, + "num_input_tokens_seen": 117536800, + "step": 5463, + "time_per_iteration": 2.649257183074951 + }, + { + "auxiliary_loss_clip": 0.01155649, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.04928041, + "balance_loss_mlp": 1.02368033, + "epoch": 0.657007154451993, + "flos": 18442428220800.0, + "grad_norm": 1.8745029126214443, + "language_loss": 0.74964637, + "learning_rate": 1.11213468615039e-06, + "loss": 0.77151465, + "num_input_tokens_seen": 117556230, + "step": 5464, + "time_per_iteration": 2.450623035430908 + }, + { + "auxiliary_loss_clip": 0.01100681, + "auxiliary_loss_mlp": 0.01026109, + "balance_loss_clip": 1.04264569, + "balance_loss_mlp": 1.01894748, + "epoch": 0.6571273973426321, + "flos": 25156717902720.0, + "grad_norm": 1.4900733433035265, + "language_loss": 0.75171518, + "learning_rate": 1.1114367480595292e-06, + "loss": 0.77298307, + "num_input_tokens_seen": 117577310, + "step": 5465, + "time_per_iteration": 2.615804433822632 + }, + { + "auxiliary_loss_clip": 0.01101774, + "auxiliary_loss_mlp": 0.01030364, + "balance_loss_clip": 1.04615283, + "balance_loss_mlp": 1.02239525, + "epoch": 0.6572476402332712, + "flos": 17529830352000.0, + "grad_norm": 1.7735486969208059, + "language_loss": 0.81244195, + "learning_rate": 1.1107389447630086e-06, + "loss": 0.8337633, + "num_input_tokens_seen": 117596010, + "step": 5466, + "time_per_iteration": 2.557216167449951 + }, + { + "auxiliary_loss_clip": 0.01137687, + "auxiliary_loss_mlp": 0.00761851, + "balance_loss_clip": 1.04379416, + "balance_loss_mlp": 1.00078666, + "epoch": 0.6573678831239103, + "flos": 17014260487680.0, + "grad_norm": 1.9922111222417023, + "language_loss": 0.78405643, + "learning_rate": 1.1100412763666818e-06, + "loss": 0.80305183, + "num_input_tokens_seen": 117611270, + "step": 5467, + "time_per_iteration": 2.4717767238616943 + }, + { + "auxiliary_loss_clip": 0.01144888, + "auxiliary_loss_mlp": 0.01023207, + "balance_loss_clip": 1.04768229, + "balance_loss_mlp": 1.01533329, + "epoch": 0.6574881260145494, + "flos": 23910078528000.0, + "grad_norm": 1.4934403424944827, + "language_loss": 0.80037481, + "learning_rate": 1.1093437429763865e-06, + "loss": 0.82205576, + "num_input_tokens_seen": 117631535, + "step": 5468, + "time_per_iteration": 2.519512414932251 + }, + { + "auxiliary_loss_clip": 0.01157556, + "auxiliary_loss_mlp": 0.01019966, + "balance_loss_clip": 1.0499711, + "balance_loss_mlp": 1.01305163, + "epoch": 0.6576083689051885, + "flos": 11218458504960.0, + "grad_norm": 2.173774977131815, + "language_loss": 0.73243797, + "learning_rate": 1.1086463446979361e-06, + "loss": 0.75421321, + "num_input_tokens_seen": 117649885, + "step": 5469, + "time_per_iteration": 2.473111629486084 + }, + { + "auxiliary_loss_clip": 0.01161031, + "auxiliary_loss_mlp": 0.01024323, + "balance_loss_clip": 1.05172396, + "balance_loss_mlp": 1.01703048, + "epoch": 0.6577286117958275, + "flos": 22455553190400.0, + "grad_norm": 1.756797979653362, + "language_loss": 0.77532804, + "learning_rate": 1.1079490816371277e-06, + "loss": 0.79718161, + "num_input_tokens_seen": 117669650, + "step": 5470, + "time_per_iteration": 2.466010808944702 + }, + { + "auxiliary_loss_clip": 0.01158691, + "auxiliary_loss_mlp": 0.00762291, + "balance_loss_clip": 1.04814124, + "balance_loss_mlp": 1.00074315, + "epoch": 0.6578488546864667, + "flos": 21872184405120.0, + "grad_norm": 2.3502520897543393, + "language_loss": 0.75057077, + "learning_rate": 1.1072519538997352e-06, + "loss": 0.76978064, + "num_input_tokens_seen": 117688790, + "step": 5471, + "time_per_iteration": 2.4627366065979004 + }, + { + "auxiliary_loss_clip": 0.01144903, + "auxiliary_loss_mlp": 0.01023788, + "balance_loss_clip": 1.04452133, + "balance_loss_mlp": 1.01660573, + "epoch": 0.6579690975771058, + "flos": 23543753673600.0, + "grad_norm": 1.744828416464258, + "language_loss": 0.82241738, + "learning_rate": 1.1065549615915095e-06, + "loss": 0.84410429, + "num_input_tokens_seen": 117708620, + "step": 5472, + "time_per_iteration": 2.516986608505249 + }, + { + "auxiliary_loss_clip": 0.01161106, + "auxiliary_loss_mlp": 0.0102833, + "balance_loss_clip": 1.0532831, + "balance_loss_mlp": 1.0206176, + "epoch": 0.6580893404677448, + "flos": 32743995730560.0, + "grad_norm": 2.1604951934973347, + "language_loss": 0.78304708, + "learning_rate": 1.105858104818187e-06, + "loss": 0.80494142, + "num_input_tokens_seen": 117729775, + "step": 5473, + "time_per_iteration": 2.5387651920318604 + }, + { + "auxiliary_loss_clip": 0.01161532, + "auxiliary_loss_mlp": 0.01025005, + "balance_loss_clip": 1.04977083, + "balance_loss_mlp": 1.01672578, + "epoch": 0.658209583358384, + "flos": 15888138220800.0, + "grad_norm": 6.193159378752744, + "language_loss": 0.74840915, + "learning_rate": 1.105161383685478e-06, + "loss": 0.77027452, + "num_input_tokens_seen": 117746160, + "step": 5474, + "time_per_iteration": 2.4260239601135254 + }, + { + "auxiliary_loss_clip": 0.01042022, + "auxiliary_loss_mlp": 0.01001845, + "balance_loss_clip": 1.01838005, + "balance_loss_mlp": 1.00092697, + "epoch": 0.658329826249023, + "flos": 62695902447360.0, + "grad_norm": 0.7297184140266282, + "language_loss": 0.56311542, + "learning_rate": 1.1044647982990771e-06, + "loss": 0.58355409, + "num_input_tokens_seen": 117808045, + "step": 5475, + "time_per_iteration": 3.0598082542419434 + }, + { + "auxiliary_loss_clip": 0.01145704, + "auxiliary_loss_mlp": 0.01026571, + "balance_loss_clip": 1.04793811, + "balance_loss_mlp": 1.01879227, + "epoch": 0.6584500691396621, + "flos": 31722624501120.0, + "grad_norm": 2.511164672358488, + "language_loss": 0.64645272, + "learning_rate": 1.1037683487646536e-06, + "loss": 0.66817558, + "num_input_tokens_seen": 117828330, + "step": 5476, + "time_per_iteration": 2.5600411891937256 + }, + { + "auxiliary_loss_clip": 0.01142695, + "auxiliary_loss_mlp": 0.00762484, + "balance_loss_clip": 1.04982102, + "balance_loss_mlp": 1.00066936, + "epoch": 0.6585703120303013, + "flos": 18406086635520.0, + "grad_norm": 2.104784881122586, + "language_loss": 0.77099991, + "learning_rate": 1.1030720351878583e-06, + "loss": 0.79005164, + "num_input_tokens_seen": 117846450, + "step": 5477, + "time_per_iteration": 2.44301700592041 + }, + { + "auxiliary_loss_clip": 0.01054978, + "auxiliary_loss_mlp": 0.01001845, + "balance_loss_clip": 1.01835775, + "balance_loss_mlp": 1.00098693, + "epoch": 0.6586905549209403, + "flos": 58309880434560.0, + "grad_norm": 0.8136869411179377, + "language_loss": 0.57718468, + "learning_rate": 1.102375857674323e-06, + "loss": 0.59775293, + "num_input_tokens_seen": 117908365, + "step": 5478, + "time_per_iteration": 3.032545804977417 + }, + { + "auxiliary_loss_clip": 0.01143578, + "auxiliary_loss_mlp": 0.01024578, + "balance_loss_clip": 1.04626369, + "balance_loss_mlp": 1.01717186, + "epoch": 0.6588107978115794, + "flos": 22782627457920.0, + "grad_norm": 1.7751988157288778, + "language_loss": 0.9038468, + "learning_rate": 1.1016798163296561e-06, + "loss": 0.92552835, + "num_input_tokens_seen": 117927565, + "step": 5479, + "time_per_iteration": 2.505192756652832 + }, + { + "auxiliary_loss_clip": 0.01160699, + "auxiliary_loss_mlp": 0.01021794, + "balance_loss_clip": 1.0491817, + "balance_loss_mlp": 1.01402748, + "epoch": 0.6589310407022185, + "flos": 20667525050880.0, + "grad_norm": 1.8999939411733766, + "language_loss": 0.66394627, + "learning_rate": 1.1009839112594471e-06, + "loss": 0.68577117, + "num_input_tokens_seen": 117945590, + "step": 5480, + "time_per_iteration": 2.445777177810669 + }, + { + "auxiliary_loss_clip": 0.01160084, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.04912531, + "balance_loss_mlp": 1.02225971, + "epoch": 0.6590512835928576, + "flos": 25630595055360.0, + "grad_norm": 2.1081587316969688, + "language_loss": 0.71923256, + "learning_rate": 1.1002881425692638e-06, + "loss": 0.74113327, + "num_input_tokens_seen": 117966020, + "step": 5481, + "time_per_iteration": 2.4979910850524902 + }, + { + "auxiliary_loss_clip": 0.01151807, + "auxiliary_loss_mlp": 0.0102442, + "balance_loss_clip": 1.04621863, + "balance_loss_mlp": 1.01660919, + "epoch": 0.6591715264834966, + "flos": 23726108044800.0, + "grad_norm": 1.6499540453889339, + "language_loss": 0.75149906, + "learning_rate": 1.0995925103646532e-06, + "loss": 0.77326131, + "num_input_tokens_seen": 117984620, + "step": 5482, + "time_per_iteration": 3.2330353260040283 + }, + { + "auxiliary_loss_clip": 0.01126224, + "auxiliary_loss_mlp": 0.0102355, + "balance_loss_clip": 1.0471046, + "balance_loss_mlp": 1.01577759, + "epoch": 0.6592917693741358, + "flos": 35773850822400.0, + "grad_norm": 1.7927925427408702, + "language_loss": 0.66550684, + "learning_rate": 1.0988970147511437e-06, + "loss": 0.68700463, + "num_input_tokens_seen": 118006500, + "step": 5483, + "time_per_iteration": 2.6555371284484863 + }, + { + "auxiliary_loss_clip": 0.01144911, + "auxiliary_loss_mlp": 0.01027313, + "balance_loss_clip": 1.04939961, + "balance_loss_mlp": 1.01949286, + "epoch": 0.6594120122647749, + "flos": 21396834794880.0, + "grad_norm": 5.828404758710859, + "language_loss": 0.80168962, + "learning_rate": 1.0982016558342405e-06, + "loss": 0.82341182, + "num_input_tokens_seen": 118025470, + "step": 5484, + "time_per_iteration": 2.4823062419891357 + }, + { + "auxiliary_loss_clip": 0.01173003, + "auxiliary_loss_mlp": 0.01022017, + "balance_loss_clip": 1.05158246, + "balance_loss_mlp": 1.01477814, + "epoch": 0.6595322551554139, + "flos": 19351829779200.0, + "grad_norm": 2.092667029019394, + "language_loss": 0.71033549, + "learning_rate": 1.0975064337194291e-06, + "loss": 0.73228574, + "num_input_tokens_seen": 118043515, + "step": 5485, + "time_per_iteration": 2.4008774757385254 + }, + { + "auxiliary_loss_clip": 0.01127299, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.0471586, + "balance_loss_mlp": 1.0298183, + "epoch": 0.6596524980460531, + "flos": 16837113588480.0, + "grad_norm": 2.6069982098550684, + "language_loss": 0.7033574, + "learning_rate": 1.0968113485121743e-06, + "loss": 0.72500384, + "num_input_tokens_seen": 118063105, + "step": 5486, + "time_per_iteration": 3.3639791011810303 + }, + { + "auxiliary_loss_clip": 0.01157822, + "auxiliary_loss_mlp": 0.0076278, + "balance_loss_clip": 1.04614782, + "balance_loss_mlp": 1.00071931, + "epoch": 0.6597727409366921, + "flos": 21798567480960.0, + "grad_norm": 1.727159501699965, + "language_loss": 0.8022033, + "learning_rate": 1.0961164003179185e-06, + "loss": 0.82140934, + "num_input_tokens_seen": 118081615, + "step": 5487, + "time_per_iteration": 3.3240482807159424 + }, + { + "auxiliary_loss_clip": 0.01128229, + "auxiliary_loss_mlp": 0.01026839, + "balance_loss_clip": 1.04485822, + "balance_loss_mlp": 1.01906073, + "epoch": 0.6598929838273312, + "flos": 23730704985600.0, + "grad_norm": 1.7684741516972529, + "language_loss": 0.84352738, + "learning_rate": 1.0954215892420884e-06, + "loss": 0.86507809, + "num_input_tokens_seen": 118102315, + "step": 5488, + "time_per_iteration": 2.5419845581054688 + }, + { + "auxiliary_loss_clip": 0.01134156, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.04790759, + "balance_loss_mlp": 1.02376807, + "epoch": 0.6600132267179702, + "flos": 19974520978560.0, + "grad_norm": 1.7655967211589083, + "language_loss": 0.70552593, + "learning_rate": 1.094726915390082e-06, + "loss": 0.727189, + "num_input_tokens_seen": 118120650, + "step": 5489, + "time_per_iteration": 3.272987127304077 + }, + { + "auxiliary_loss_clip": 0.01159659, + "auxiliary_loss_mlp": 0.01026516, + "balance_loss_clip": 1.04994094, + "balance_loss_mlp": 1.01894069, + "epoch": 0.6601334696086094, + "flos": 22342649765760.0, + "grad_norm": 1.7514712984569456, + "language_loss": 0.69936025, + "learning_rate": 1.0940323788672836e-06, + "loss": 0.72122204, + "num_input_tokens_seen": 118139825, + "step": 5490, + "time_per_iteration": 2.4691755771636963 + }, + { + "auxiliary_loss_clip": 0.01154527, + "auxiliary_loss_mlp": 0.01023903, + "balance_loss_clip": 1.04891181, + "balance_loss_mlp": 1.0162468, + "epoch": 0.6602537124992485, + "flos": 25703098657920.0, + "grad_norm": 1.5914370212075002, + "language_loss": 0.73699164, + "learning_rate": 1.0933379797790522e-06, + "loss": 0.75877589, + "num_input_tokens_seen": 118159240, + "step": 5491, + "time_per_iteration": 2.4835219383239746 + }, + { + "auxiliary_loss_clip": 0.0117339, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.0516032, + "balance_loss_mlp": 1.01980305, + "epoch": 0.6603739553898875, + "flos": 25848572739840.0, + "grad_norm": 2.181969012948015, + "language_loss": 0.7085371, + "learning_rate": 1.0926437182307293e-06, + "loss": 0.73054838, + "num_input_tokens_seen": 118178050, + "step": 5492, + "time_per_iteration": 2.4552559852600098 + }, + { + "auxiliary_loss_clip": 0.01147338, + "auxiliary_loss_mlp": 0.01025934, + "balance_loss_clip": 1.0467391, + "balance_loss_mlp": 1.0182209, + "epoch": 0.6604941982805267, + "flos": 24570296461440.0, + "grad_norm": 1.7549829047717922, + "language_loss": 0.77862883, + "learning_rate": 1.0919495943276338e-06, + "loss": 0.80036157, + "num_input_tokens_seen": 118199070, + "step": 5493, + "time_per_iteration": 2.5302581787109375 + }, + { + "auxiliary_loss_clip": 0.01131722, + "auxiliary_loss_mlp": 0.01025673, + "balance_loss_clip": 1.04236674, + "balance_loss_mlp": 1.01735234, + "epoch": 0.6606144411711657, + "flos": 13261775581440.0, + "grad_norm": 3.1017645903864124, + "language_loss": 0.7631433, + "learning_rate": 1.0912556081750611e-06, + "loss": 0.7847172, + "num_input_tokens_seen": 118217000, + "step": 5494, + "time_per_iteration": 2.5310869216918945 + }, + { + "auxiliary_loss_clip": 0.01142509, + "auxiliary_loss_mlp": 0.01026035, + "balance_loss_clip": 1.04902244, + "balance_loss_mlp": 1.01875758, + "epoch": 0.6607346840618048, + "flos": 25155281358720.0, + "grad_norm": 1.9716868875742357, + "language_loss": 0.76578814, + "learning_rate": 1.0905617598782909e-06, + "loss": 0.78747356, + "num_input_tokens_seen": 118237205, + "step": 5495, + "time_per_iteration": 2.502443313598633 + }, + { + "auxiliary_loss_clip": 0.01108803, + "auxiliary_loss_mlp": 0.01026434, + "balance_loss_clip": 1.04394794, + "balance_loss_mlp": 1.01927519, + "epoch": 0.660854926952444, + "flos": 17638029095040.0, + "grad_norm": 3.1336762055953695, + "language_loss": 0.80862933, + "learning_rate": 1.0898680495425775e-06, + "loss": 0.82998168, + "num_input_tokens_seen": 118255495, + "step": 5496, + "time_per_iteration": 2.5191776752471924 + }, + { + "auxiliary_loss_clip": 0.01147984, + "auxiliary_loss_mlp": 0.01027016, + "balance_loss_clip": 1.04920626, + "balance_loss_mlp": 1.01960707, + "epoch": 0.660975169843083, + "flos": 16836000266880.0, + "grad_norm": 1.627412866153545, + "language_loss": 0.80131406, + "learning_rate": 1.0891744772731594e-06, + "loss": 0.82306409, + "num_input_tokens_seen": 118273310, + "step": 5497, + "time_per_iteration": 2.4517807960510254 + }, + { + "auxiliary_loss_clip": 0.01159899, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.0488503, + "balance_loss_mlp": 1.02308798, + "epoch": 0.6610954127337221, + "flos": 26870410846080.0, + "grad_norm": 1.5313410030880796, + "language_loss": 0.66256523, + "learning_rate": 1.088481043175248e-06, + "loss": 0.68446678, + "num_input_tokens_seen": 118293880, + "step": 5498, + "time_per_iteration": 2.50740385055542 + }, + { + "auxiliary_loss_clip": 0.01130624, + "auxiliary_loss_mlp": 0.01023767, + "balance_loss_clip": 1.04173291, + "balance_loss_mlp": 1.0161705, + "epoch": 0.6612156556243612, + "flos": 26465697331200.0, + "grad_norm": 1.5800179556884986, + "language_loss": 0.75429577, + "learning_rate": 1.0877877473540368e-06, + "loss": 0.77583969, + "num_input_tokens_seen": 118314465, + "step": 5499, + "time_per_iteration": 2.5167829990386963 + }, + { + "auxiliary_loss_clip": 0.01173366, + "auxiliary_loss_mlp": 0.01021604, + "balance_loss_clip": 1.04949057, + "balance_loss_mlp": 1.01420736, + "epoch": 0.6613358985150003, + "flos": 19791915212160.0, + "grad_norm": 1.695776739112023, + "language_loss": 0.72491169, + "learning_rate": 1.0870945899147002e-06, + "loss": 0.74686146, + "num_input_tokens_seen": 118331110, + "step": 5500, + "time_per_iteration": 2.3996667861938477 + }, + { + "auxiliary_loss_clip": 0.01154867, + "auxiliary_loss_mlp": 0.01027601, + "balance_loss_clip": 1.04924369, + "balance_loss_mlp": 1.02043664, + "epoch": 0.6614561414056394, + "flos": 26831627136000.0, + "grad_norm": 1.782129191087351, + "language_loss": 0.76254296, + "learning_rate": 1.0864015709623879e-06, + "loss": 0.78436768, + "num_input_tokens_seen": 118351980, + "step": 5501, + "time_per_iteration": 2.4809956550598145 + }, + { + "auxiliary_loss_clip": 0.01160495, + "auxiliary_loss_mlp": 0.01025799, + "balance_loss_clip": 1.04785395, + "balance_loss_mlp": 1.01853371, + "epoch": 0.6615763842962785, + "flos": 22894597128960.0, + "grad_norm": 2.2878075119431918, + "language_loss": 0.80370712, + "learning_rate": 1.0857086906022313e-06, + "loss": 0.82557011, + "num_input_tokens_seen": 118370315, + "step": 5502, + "time_per_iteration": 2.4603171348571777 + }, + { + "auxiliary_loss_clip": 0.01092524, + "auxiliary_loss_mlp": 0.01024673, + "balance_loss_clip": 1.04411626, + "balance_loss_mlp": 1.01672781, + "epoch": 0.6616966271869176, + "flos": 24790321221120.0, + "grad_norm": 1.9567973686304647, + "language_loss": 0.73015428, + "learning_rate": 1.0850159489393388e-06, + "loss": 0.75132626, + "num_input_tokens_seen": 118389575, + "step": 5503, + "time_per_iteration": 2.591839551925659 + }, + { + "auxiliary_loss_clip": 0.01120105, + "auxiliary_loss_mlp": 0.01025006, + "balance_loss_clip": 1.04040778, + "balance_loss_mlp": 1.01731658, + "epoch": 0.6618168700775566, + "flos": 17202109639680.0, + "grad_norm": 1.7726425937653567, + "language_loss": 0.8221457, + "learning_rate": 1.0843233460787992e-06, + "loss": 0.84359682, + "num_input_tokens_seen": 118406790, + "step": 5504, + "time_per_iteration": 2.5015242099761963 + }, + { + "auxiliary_loss_clip": 0.01121837, + "auxiliary_loss_mlp": 0.01025203, + "balance_loss_clip": 1.04845929, + "balance_loss_mlp": 1.01745415, + "epoch": 0.6619371129681958, + "flos": 25447091448960.0, + "grad_norm": 1.817277690297216, + "language_loss": 0.77930063, + "learning_rate": 1.0836308821256805e-06, + "loss": 0.800771, + "num_input_tokens_seen": 118427590, + "step": 5505, + "time_per_iteration": 2.550518751144409 + }, + { + "auxiliary_loss_clip": 0.01156609, + "auxiliary_loss_mlp": 0.01026321, + "balance_loss_clip": 1.04958797, + "balance_loss_mlp": 1.01912665, + "epoch": 0.6620573558588349, + "flos": 18040444139520.0, + "grad_norm": 2.0885954603458066, + "language_loss": 0.78144073, + "learning_rate": 1.0829385571850282e-06, + "loss": 0.80327004, + "num_input_tokens_seen": 118444570, + "step": 5506, + "time_per_iteration": 2.432097911834717 + }, + { + "auxiliary_loss_clip": 0.01175803, + "auxiliary_loss_mlp": 0.01022833, + "balance_loss_clip": 1.05067146, + "balance_loss_mlp": 1.014727, + "epoch": 0.6621775987494739, + "flos": 17785586165760.0, + "grad_norm": 2.6374579240350045, + "language_loss": 0.83635974, + "learning_rate": 1.0822463713618679e-06, + "loss": 0.85834622, + "num_input_tokens_seen": 118461425, + "step": 5507, + "time_per_iteration": 2.390392541885376 + }, + { + "auxiliary_loss_clip": 0.01129776, + "auxiliary_loss_mlp": 0.01027962, + "balance_loss_clip": 1.0454073, + "balance_loss_mlp": 1.02062166, + "epoch": 0.6622978416401131, + "flos": 17492590926720.0, + "grad_norm": 2.0372885234616733, + "language_loss": 0.84817189, + "learning_rate": 1.0815543247612034e-06, + "loss": 0.86974925, + "num_input_tokens_seen": 118478495, + "step": 5508, + "time_per_iteration": 2.498204231262207 + }, + { + "auxiliary_loss_clip": 0.01139714, + "auxiliary_loss_mlp": 0.01020393, + "balance_loss_clip": 1.04204845, + "balance_loss_mlp": 1.01306212, + "epoch": 0.6624180845307521, + "flos": 21648352803840.0, + "grad_norm": 1.6316674313800148, + "language_loss": 0.82571679, + "learning_rate": 1.0808624174880168e-06, + "loss": 0.84731787, + "num_input_tokens_seen": 118499145, + "step": 5509, + "time_per_iteration": 3.2900233268737793 + }, + { + "auxiliary_loss_clip": 0.01170201, + "auxiliary_loss_mlp": 0.01022683, + "balance_loss_clip": 1.05057073, + "balance_loss_mlp": 1.01573944, + "epoch": 0.6625383274213912, + "flos": 23805902108160.0, + "grad_norm": 1.681469688867829, + "language_loss": 0.79641908, + "learning_rate": 1.080170649647272e-06, + "loss": 0.81834799, + "num_input_tokens_seen": 118518950, + "step": 5510, + "time_per_iteration": 2.4409148693084717 + }, + { + "auxiliary_loss_clip": 0.01169568, + "auxiliary_loss_mlp": 0.01022581, + "balance_loss_clip": 1.04933012, + "balance_loss_mlp": 1.01496696, + "epoch": 0.6626585703120303, + "flos": 33262941473280.0, + "grad_norm": 1.5850588106199062, + "language_loss": 0.67071486, + "learning_rate": 1.0794790213439068e-06, + "loss": 0.69263631, + "num_input_tokens_seen": 118545850, + "step": 5511, + "time_per_iteration": 2.583749294281006 + }, + { + "auxiliary_loss_clip": 0.01117278, + "auxiliary_loss_mlp": 0.01029025, + "balance_loss_clip": 1.04536104, + "balance_loss_mlp": 1.0209192, + "epoch": 0.6627788132026694, + "flos": 22085780630400.0, + "grad_norm": 2.1677491242586373, + "language_loss": 0.78728771, + "learning_rate": 1.078787532682843e-06, + "loss": 0.80875069, + "num_input_tokens_seen": 118563325, + "step": 5512, + "time_per_iteration": 3.5486109256744385 + }, + { + "auxiliary_loss_clip": 0.01153587, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.04797149, + "balance_loss_mlp": 1.02104831, + "epoch": 0.6628990560933085, + "flos": 36173608260480.0, + "grad_norm": 2.457118403339589, + "language_loss": 0.7613343, + "learning_rate": 1.0780961837689773e-06, + "loss": 0.78315347, + "num_input_tokens_seen": 118582835, + "step": 5513, + "time_per_iteration": 3.4394383430480957 + }, + { + "auxiliary_loss_clip": 0.01137165, + "auxiliary_loss_mlp": 0.010237, + "balance_loss_clip": 1.0473299, + "balance_loss_mlp": 1.0163449, + "epoch": 0.6630192989839476, + "flos": 18513567106560.0, + "grad_norm": 1.8659128961843627, + "language_loss": 0.69941652, + "learning_rate": 1.0774049747071883e-06, + "loss": 0.72102517, + "num_input_tokens_seen": 118600715, + "step": 5514, + "time_per_iteration": 2.46176815032959 + }, + { + "auxiliary_loss_clip": 0.0111058, + "auxiliary_loss_mlp": 0.01029971, + "balance_loss_clip": 1.04435849, + "balance_loss_mlp": 1.02205515, + "epoch": 0.6631395418745867, + "flos": 35809510049280.0, + "grad_norm": 1.786747177903583, + "language_loss": 0.68000865, + "learning_rate": 1.076713905602332e-06, + "loss": 0.70141411, + "num_input_tokens_seen": 118621290, + "step": 5515, + "time_per_iteration": 2.6638760566711426 + }, + { + "auxiliary_loss_clip": 0.01160182, + "auxiliary_loss_mlp": 0.01022697, + "balance_loss_clip": 1.04950476, + "balance_loss_mlp": 1.01544881, + "epoch": 0.6632597847652257, + "flos": 20047742853120.0, + "grad_norm": 1.825025368132173, + "language_loss": 0.81068814, + "learning_rate": 1.07602297655924e-06, + "loss": 0.83251691, + "num_input_tokens_seen": 118639610, + "step": 5516, + "time_per_iteration": 3.2048749923706055 + }, + { + "auxiliary_loss_clip": 0.01171914, + "auxiliary_loss_mlp": 0.0102587, + "balance_loss_clip": 1.05165792, + "balance_loss_mlp": 1.01868165, + "epoch": 0.6633800276558649, + "flos": 21214480423680.0, + "grad_norm": 1.8426631964134386, + "language_loss": 0.81222248, + "learning_rate": 1.0753321876827292e-06, + "loss": 0.83420026, + "num_input_tokens_seen": 118658895, + "step": 5517, + "time_per_iteration": 2.432898998260498 + }, + { + "auxiliary_loss_clip": 0.01169314, + "auxiliary_loss_mlp": 0.01023304, + "balance_loss_clip": 1.0471499, + "balance_loss_mlp": 1.01585925, + "epoch": 0.663500270546504, + "flos": 23987753688960.0, + "grad_norm": 1.7610862714714586, + "language_loss": 0.74105859, + "learning_rate": 1.0746415390775893e-06, + "loss": 0.76298475, + "num_input_tokens_seen": 118677025, + "step": 5518, + "time_per_iteration": 2.430189609527588 + }, + { + "auxiliary_loss_clip": 0.01170991, + "auxiliary_loss_mlp": 0.01026131, + "balance_loss_clip": 1.05153191, + "balance_loss_mlp": 1.0189929, + "epoch": 0.663620513437143, + "flos": 17932389050880.0, + "grad_norm": 2.193754933738702, + "language_loss": 0.76538527, + "learning_rate": 1.0739510308485939e-06, + "loss": 0.7873565, + "num_input_tokens_seen": 118694240, + "step": 5519, + "time_per_iteration": 2.3850111961364746 + }, + { + "auxiliary_loss_clip": 0.01046035, + "auxiliary_loss_mlp": 0.01002203, + "balance_loss_clip": 1.01747537, + "balance_loss_mlp": 1.00130308, + "epoch": 0.6637407563277821, + "flos": 57840241086720.0, + "grad_norm": 0.805975172859145, + "language_loss": 0.62494946, + "learning_rate": 1.07326066310049e-06, + "loss": 0.64543176, + "num_input_tokens_seen": 118758365, + "step": 5520, + "time_per_iteration": 3.115778684616089 + }, + { + "auxiliary_loss_clip": 0.01124287, + "auxiliary_loss_mlp": 0.01025816, + "balance_loss_clip": 1.04352069, + "balance_loss_mlp": 1.01741815, + "epoch": 0.6638609992184212, + "flos": 27306007079040.0, + "grad_norm": 2.9635734425357914, + "language_loss": 0.7924149, + "learning_rate": 1.0725704359380059e-06, + "loss": 0.81391597, + "num_input_tokens_seen": 118778220, + "step": 5521, + "time_per_iteration": 2.5515520572662354 + }, + { + "auxiliary_loss_clip": 0.01160887, + "auxiliary_loss_mlp": 0.01022939, + "balance_loss_clip": 1.04841661, + "balance_loss_mlp": 1.01603103, + "epoch": 0.6639812421090603, + "flos": 18624854419200.0, + "grad_norm": 1.9282956814426322, + "language_loss": 0.72169173, + "learning_rate": 1.0718803494658497e-06, + "loss": 0.74352998, + "num_input_tokens_seen": 118797110, + "step": 5522, + "time_per_iteration": 2.4254212379455566 + }, + { + "auxiliary_loss_clip": 0.01078893, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.04172552, + "balance_loss_mlp": 1.02364695, + "epoch": 0.6641014849996993, + "flos": 15924479806080.0, + "grad_norm": 2.014979946928305, + "language_loss": 0.83849549, + "learning_rate": 1.071190403788707e-06, + "loss": 0.859604, + "num_input_tokens_seen": 118812415, + "step": 5523, + "time_per_iteration": 2.5648577213287354 + }, + { + "auxiliary_loss_clip": 0.01135727, + "auxiliary_loss_mlp": 0.01026546, + "balance_loss_clip": 1.04852486, + "balance_loss_mlp": 1.01884484, + "epoch": 0.6642217278903385, + "flos": 26505486622080.0, + "grad_norm": 1.732655601491865, + "language_loss": 0.75329524, + "learning_rate": 1.0705005990112415e-06, + "loss": 0.77491796, + "num_input_tokens_seen": 118832195, + "step": 5524, + "time_per_iteration": 2.5725104808807373 + }, + { + "auxiliary_loss_clip": 0.01104152, + "auxiliary_loss_mlp": 0.01030067, + "balance_loss_clip": 1.04475117, + "balance_loss_mlp": 1.02262878, + "epoch": 0.6643419707809776, + "flos": 15377308951680.0, + "grad_norm": 3.315039872400583, + "language_loss": 0.7430492, + "learning_rate": 1.0698109352380957e-06, + "loss": 0.76439142, + "num_input_tokens_seen": 118849795, + "step": 5525, + "time_per_iteration": 2.5100502967834473 + }, + { + "auxiliary_loss_clip": 0.01169231, + "auxiliary_loss_mlp": 0.01024484, + "balance_loss_clip": 1.04901254, + "balance_loss_mlp": 1.01753092, + "epoch": 0.6644622136716166, + "flos": 25117610970240.0, + "grad_norm": 2.391484785397507, + "language_loss": 0.7815215, + "learning_rate": 1.0691214125738909e-06, + "loss": 0.80345863, + "num_input_tokens_seen": 118870000, + "step": 5526, + "time_per_iteration": 2.450190305709839 + }, + { + "auxiliary_loss_clip": 0.01070438, + "auxiliary_loss_mlp": 0.01001672, + "balance_loss_clip": 1.01508987, + "balance_loss_mlp": 1.00081372, + "epoch": 0.6645824565622558, + "flos": 66201717680640.0, + "grad_norm": 0.7918490834424107, + "language_loss": 0.57526654, + "learning_rate": 1.0684320311232287e-06, + "loss": 0.59598768, + "num_input_tokens_seen": 118932905, + "step": 5527, + "time_per_iteration": 3.0717484951019287 + }, + { + "auxiliary_loss_clip": 0.01141057, + "auxiliary_loss_mlp": 0.01025754, + "balance_loss_clip": 1.04597092, + "balance_loss_mlp": 1.01792765, + "epoch": 0.6647026994528948, + "flos": 25082131311360.0, + "grad_norm": 1.8932580601366746, + "language_loss": 0.81569481, + "learning_rate": 1.0677427909906865e-06, + "loss": 0.83736289, + "num_input_tokens_seen": 118953355, + "step": 5528, + "time_per_iteration": 2.6524579524993896 + }, + { + "auxiliary_loss_clip": 0.01175936, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.05256224, + "balance_loss_mlp": 1.02252936, + "epoch": 0.6648229423435339, + "flos": 18222187979520.0, + "grad_norm": 1.9618559863124314, + "language_loss": 0.71879601, + "learning_rate": 1.0670536922808216e-06, + "loss": 0.74086094, + "num_input_tokens_seen": 118973480, + "step": 5529, + "time_per_iteration": 2.5966434478759766 + }, + { + "auxiliary_loss_clip": 0.01142242, + "auxiliary_loss_mlp": 0.01025324, + "balance_loss_clip": 1.04695535, + "balance_loss_mlp": 1.01833844, + "epoch": 0.6649431852341731, + "flos": 18296882311680.0, + "grad_norm": 4.6810062473447145, + "language_loss": 0.71760809, + "learning_rate": 1.06636473509817e-06, + "loss": 0.73928374, + "num_input_tokens_seen": 118989860, + "step": 5530, + "time_per_iteration": 2.50557804107666 + }, + { + "auxiliary_loss_clip": 0.01137328, + "auxiliary_loss_mlp": 0.00762909, + "balance_loss_clip": 1.04518366, + "balance_loss_mlp": 1.00060987, + "epoch": 0.6650634281248121, + "flos": 17019575700480.0, + "grad_norm": 2.0045402465359885, + "language_loss": 0.80801779, + "learning_rate": 1.0656759195472447e-06, + "loss": 0.82702017, + "num_input_tokens_seen": 119007150, + "step": 5531, + "time_per_iteration": 2.468345880508423 + }, + { + "auxiliary_loss_clip": 0.01048867, + "auxiliary_loss_mlp": 0.01001513, + "balance_loss_clip": 1.01583886, + "balance_loss_mlp": 1.00061309, + "epoch": 0.6651836710154512, + "flos": 69294810666240.0, + "grad_norm": 0.7688547156161302, + "language_loss": 0.59770262, + "learning_rate": 1.0649872457325414e-06, + "loss": 0.61820644, + "num_input_tokens_seen": 119068435, + "step": 5532, + "time_per_iteration": 3.0359108448028564 + }, + { + "auxiliary_loss_clip": 0.01060317, + "auxiliary_loss_mlp": 0.01001244, + "balance_loss_clip": 1.01387858, + "balance_loss_mlp": 1.00034988, + "epoch": 0.6653039139060903, + "flos": 66883444882560.0, + "grad_norm": 0.8540739284539888, + "language_loss": 0.55148345, + "learning_rate": 1.0642987137585278e-06, + "loss": 0.57209909, + "num_input_tokens_seen": 119127960, + "step": 5533, + "time_per_iteration": 2.9857611656188965 + }, + { + "auxiliary_loss_clip": 0.01141108, + "auxiliary_loss_mlp": 0.01025743, + "balance_loss_clip": 1.04616153, + "balance_loss_mlp": 1.01859641, + "epoch": 0.6654241567967294, + "flos": 21470056669440.0, + "grad_norm": 1.677822983000996, + "language_loss": 0.82282197, + "learning_rate": 1.0636103237296561e-06, + "loss": 0.84449053, + "num_input_tokens_seen": 119146885, + "step": 5534, + "time_per_iteration": 2.504901647567749 + }, + { + "auxiliary_loss_clip": 0.01155336, + "auxiliary_loss_mlp": 0.01027502, + "balance_loss_clip": 1.05058169, + "balance_loss_mlp": 1.02081406, + "epoch": 0.6655443996873684, + "flos": 25119514391040.0, + "grad_norm": 1.7147288675738555, + "language_loss": 0.84097803, + "learning_rate": 1.062922075750353e-06, + "loss": 0.86280644, + "num_input_tokens_seen": 119166900, + "step": 5535, + "time_per_iteration": 3.223299741744995 + }, + { + "auxiliary_loss_clip": 0.0113081, + "auxiliary_loss_mlp": 0.01024059, + "balance_loss_clip": 1.04607654, + "balance_loss_mlp": 1.01684368, + "epoch": 0.6656646425780076, + "flos": 17457326749440.0, + "grad_norm": 1.9615097527664374, + "language_loss": 0.72253799, + "learning_rate": 1.0622339699250267e-06, + "loss": 0.74408662, + "num_input_tokens_seen": 119184820, + "step": 5536, + "time_per_iteration": 2.5011141300201416 + }, + { + "auxiliary_loss_clip": 0.01127225, + "auxiliary_loss_mlp": 0.0102323, + "balance_loss_clip": 1.04398274, + "balance_loss_mlp": 1.01616645, + "epoch": 0.6657848854686467, + "flos": 23434190213760.0, + "grad_norm": 1.742560073184748, + "language_loss": 0.79410326, + "learning_rate": 1.0615460063580624e-06, + "loss": 0.81560779, + "num_input_tokens_seen": 119203295, + "step": 5537, + "time_per_iteration": 2.5618491172790527 + }, + { + "auxiliary_loss_clip": 0.01145145, + "auxiliary_loss_mlp": 0.01024693, + "balance_loss_clip": 1.04666209, + "balance_loss_mlp": 1.01788604, + "epoch": 0.6659051283592857, + "flos": 11509909459200.0, + "grad_norm": 1.7497521672685548, + "language_loss": 0.73423094, + "learning_rate": 1.060858185153821e-06, + "loss": 0.75592935, + "num_input_tokens_seen": 119221395, + "step": 5538, + "time_per_iteration": 2.4606072902679443 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.01024114, + "balance_loss_clip": 1.04701769, + "balance_loss_mlp": 1.01643658, + "epoch": 0.6660253712499249, + "flos": 20594554571520.0, + "grad_norm": 2.1705685113249564, + "language_loss": 0.76340932, + "learning_rate": 1.0601705064166474e-06, + "loss": 0.78513443, + "num_input_tokens_seen": 119239790, + "step": 5539, + "time_per_iteration": 3.314908981323242 + }, + { + "auxiliary_loss_clip": 0.01140862, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.04999018, + "balance_loss_mlp": 1.01981151, + "epoch": 0.666145614140564, + "flos": 21251504367360.0, + "grad_norm": 1.98334632172986, + "language_loss": 0.7346797, + "learning_rate": 1.0594829702508596e-06, + "loss": 0.75636137, + "num_input_tokens_seen": 119257505, + "step": 5540, + "time_per_iteration": 3.3076515197753906 + }, + { + "auxiliary_loss_clip": 0.01129677, + "auxiliary_loss_mlp": 0.01022363, + "balance_loss_clip": 1.04428756, + "balance_loss_mlp": 1.0152936, + "epoch": 0.666265857031203, + "flos": 33726188200320.0, + "grad_norm": 3.951639745684329, + "language_loss": 0.54971969, + "learning_rate": 1.0587955767607592e-06, + "loss": 0.57124007, + "num_input_tokens_seen": 119279365, + "step": 5541, + "time_per_iteration": 2.6498446464538574 + }, + { + "auxiliary_loss_clip": 0.01171984, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.05083442, + "balance_loss_mlp": 1.01951885, + "epoch": 0.6663860999218422, + "flos": 17456644391040.0, + "grad_norm": 2.1541790337222495, + "language_loss": 0.76832557, + "learning_rate": 1.0581083260506206e-06, + "loss": 0.79031593, + "num_input_tokens_seen": 119296150, + "step": 5542, + "time_per_iteration": 2.4024274349212646 + }, + { + "auxiliary_loss_clip": 0.01140099, + "auxiliary_loss_mlp": 0.01025213, + "balance_loss_clip": 1.04537153, + "balance_loss_mlp": 1.01805437, + "epoch": 0.6665063428124812, + "flos": 17676740977920.0, + "grad_norm": 2.126645932810646, + "language_loss": 0.76756954, + "learning_rate": 1.0574212182246993e-06, + "loss": 0.7892226, + "num_input_tokens_seen": 119314845, + "step": 5543, + "time_per_iteration": 3.2622148990631104 + }, + { + "auxiliary_loss_clip": 0.01146619, + "auxiliary_loss_mlp": 0.01024303, + "balance_loss_clip": 1.04625523, + "balance_loss_mlp": 1.01603603, + "epoch": 0.6666265857031203, + "flos": 27673265687040.0, + "grad_norm": 2.236547372071661, + "language_loss": 0.76049602, + "learning_rate": 1.0567342533872303e-06, + "loss": 0.78220528, + "num_input_tokens_seen": 119334875, + "step": 5544, + "time_per_iteration": 2.5519330501556396 + }, + { + "auxiliary_loss_clip": 0.01144798, + "auxiliary_loss_mlp": 0.01025208, + "balance_loss_clip": 1.04801226, + "balance_loss_mlp": 1.01758456, + "epoch": 0.6667468285937594, + "flos": 25046831220480.0, + "grad_norm": 1.7298219727837763, + "language_loss": 0.81157649, + "learning_rate": 1.0560474316424255e-06, + "loss": 0.83327657, + "num_input_tokens_seen": 119354635, + "step": 5545, + "time_per_iteration": 2.52504563331604 + }, + { + "auxiliary_loss_clip": 0.01142637, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.04534674, + "balance_loss_mlp": 1.0210973, + "epoch": 0.6668670714843985, + "flos": 22780472641920.0, + "grad_norm": 2.8473794769995115, + "language_loss": 0.73465812, + "learning_rate": 1.0553607530944746e-06, + "loss": 0.75637919, + "num_input_tokens_seen": 119372690, + "step": 5546, + "time_per_iteration": 2.498440980911255 + }, + { + "auxiliary_loss_clip": 0.0112748, + "auxiliary_loss_mlp": 0.01027001, + "balance_loss_clip": 1.04397225, + "balance_loss_mlp": 1.01971757, + "epoch": 0.6669873143750376, + "flos": 22163886754560.0, + "grad_norm": 2.2452083971023606, + "language_loss": 0.89550394, + "learning_rate": 1.0546742178475463e-06, + "loss": 0.91704875, + "num_input_tokens_seen": 119391685, + "step": 5547, + "time_per_iteration": 2.5326223373413086 + }, + { + "auxiliary_loss_clip": 0.0111963, + "auxiliary_loss_mlp": 0.0102169, + "balance_loss_clip": 1.04582059, + "balance_loss_mlp": 1.0146569, + "epoch": 0.6671075572656767, + "flos": 20514832335360.0, + "grad_norm": 1.714228263651133, + "language_loss": 0.86919343, + "learning_rate": 1.0539878260057868e-06, + "loss": 0.89060658, + "num_input_tokens_seen": 119410725, + "step": 5548, + "time_per_iteration": 2.643723487854004 + }, + { + "auxiliary_loss_clip": 0.01161279, + "auxiliary_loss_mlp": 0.01023832, + "balance_loss_clip": 1.05214167, + "balance_loss_mlp": 1.0153923, + "epoch": 0.6672278001563158, + "flos": 17931203902080.0, + "grad_norm": 2.4429975224497356, + "language_loss": 0.68255234, + "learning_rate": 1.0533015776733226e-06, + "loss": 0.70440346, + "num_input_tokens_seen": 119426875, + "step": 5549, + "time_per_iteration": 2.5756430625915527 + }, + { + "auxiliary_loss_clip": 0.01141113, + "auxiliary_loss_mlp": 0.01024153, + "balance_loss_clip": 1.04726541, + "balance_loss_mlp": 1.0162375, + "epoch": 0.6673480430469548, + "flos": 22342146975360.0, + "grad_norm": 2.388490199378825, + "language_loss": 0.78299356, + "learning_rate": 1.0526154729542566e-06, + "loss": 0.80464613, + "num_input_tokens_seen": 119446935, + "step": 5550, + "time_per_iteration": 2.6153078079223633 + }, + { + "auxiliary_loss_clip": 0.01128532, + "auxiliary_loss_mlp": 0.0102835, + "balance_loss_clip": 1.04624462, + "balance_loss_mlp": 1.02025509, + "epoch": 0.6674682859375939, + "flos": 20703830722560.0, + "grad_norm": 2.2910147188210037, + "language_loss": 0.79933047, + "learning_rate": 1.0519295119526699e-06, + "loss": 0.82089937, + "num_input_tokens_seen": 119463240, + "step": 5551, + "time_per_iteration": 2.6724295616149902 + }, + { + "auxiliary_loss_clip": 0.01145757, + "auxiliary_loss_mlp": 0.01021979, + "balance_loss_clip": 1.04754233, + "balance_loss_mlp": 1.01433754, + "epoch": 0.667588528828233, + "flos": 26206673379840.0, + "grad_norm": 1.5638411910386156, + "language_loss": 0.83154821, + "learning_rate": 1.0512436947726227e-06, + "loss": 0.85322547, + "num_input_tokens_seen": 119484655, + "step": 5552, + "time_per_iteration": 2.6712429523468018 + }, + { + "auxiliary_loss_clip": 0.01127753, + "auxiliary_loss_mlp": 0.01019472, + "balance_loss_clip": 1.0423404, + "balance_loss_mlp": 1.01151538, + "epoch": 0.6677087717188721, + "flos": 23071025756160.0, + "grad_norm": 2.159858252587647, + "language_loss": 0.65541106, + "learning_rate": 1.0505580215181517e-06, + "loss": 0.67688334, + "num_input_tokens_seen": 119502895, + "step": 5553, + "time_per_iteration": 2.5411529541015625 + }, + { + "auxiliary_loss_clip": 0.01028389, + "auxiliary_loss_mlp": 0.01000971, + "balance_loss_clip": 1.01356769, + "balance_loss_mlp": 1.00015473, + "epoch": 0.6678290146095112, + "flos": 70941315219840.0, + "grad_norm": 0.9283409737524261, + "language_loss": 0.56688583, + "learning_rate": 1.0498724922932753e-06, + "loss": 0.58717942, + "num_input_tokens_seen": 119561010, + "step": 5554, + "time_per_iteration": 3.0315301418304443 + }, + { + "auxiliary_loss_clip": 0.01176606, + "auxiliary_loss_mlp": 0.01025569, + "balance_loss_clip": 1.05287588, + "balance_loss_mlp": 1.0175705, + "epoch": 0.6679492575001503, + "flos": 18661088263680.0, + "grad_norm": 2.0944287112077267, + "language_loss": 0.86728042, + "learning_rate": 1.0491871072019851e-06, + "loss": 0.88930219, + "num_input_tokens_seen": 119578900, + "step": 5555, + "time_per_iteration": 2.3964390754699707 + }, + { + "auxiliary_loss_clip": 0.01133213, + "auxiliary_loss_mlp": 0.01027566, + "balance_loss_clip": 1.04447389, + "balance_loss_mlp": 1.02036595, + "epoch": 0.6680695003907894, + "flos": 29711985822720.0, + "grad_norm": 1.7539777547302515, + "language_loss": 0.63716698, + "learning_rate": 1.0485018663482555e-06, + "loss": 0.65877473, + "num_input_tokens_seen": 119598920, + "step": 5556, + "time_per_iteration": 2.5800023078918457 + }, + { + "auxiliary_loss_clip": 0.01153377, + "auxiliary_loss_mlp": 0.01021948, + "balance_loss_clip": 1.04778492, + "balance_loss_mlp": 1.01407397, + "epoch": 0.6681897432814284, + "flos": 28218964083840.0, + "grad_norm": 2.540842597012571, + "language_loss": 0.70446754, + "learning_rate": 1.0478167698360354e-06, + "loss": 0.72622073, + "num_input_tokens_seen": 119618220, + "step": 5557, + "time_per_iteration": 2.4940547943115234 + }, + { + "auxiliary_loss_clip": 0.0115044, + "auxiliary_loss_mlp": 0.01024418, + "balance_loss_clip": 1.04641485, + "balance_loss_mlp": 1.01664209, + "epoch": 0.6683099861720676, + "flos": 25046543911680.0, + "grad_norm": 2.6775474687521372, + "language_loss": 0.70299375, + "learning_rate": 1.0471318177692556e-06, + "loss": 0.72474235, + "num_input_tokens_seen": 119638520, + "step": 5558, + "time_per_iteration": 2.478252410888672 + }, + { + "auxiliary_loss_clip": 0.0111925, + "auxiliary_loss_mlp": 0.01027877, + "balance_loss_clip": 1.04527569, + "balance_loss_mlp": 1.02043796, + "epoch": 0.6684302290627067, + "flos": 22996977868800.0, + "grad_norm": 3.2440330190469435, + "language_loss": 0.76142788, + "learning_rate": 1.046447010251821e-06, + "loss": 0.78289914, + "num_input_tokens_seen": 119655850, + "step": 5559, + "time_per_iteration": 2.5569262504577637 + }, + { + "auxiliary_loss_clip": 0.01142883, + "auxiliary_loss_mlp": 0.01025282, + "balance_loss_clip": 1.04899716, + "balance_loss_mlp": 1.01816535, + "epoch": 0.6685504719533457, + "flos": 26573824247040.0, + "grad_norm": 2.138365777084513, + "language_loss": 0.75626755, + "learning_rate": 1.0457623473876157e-06, + "loss": 0.77794921, + "num_input_tokens_seen": 119675355, + "step": 5560, + "time_per_iteration": 2.578597068786621 + }, + { + "auxiliary_loss_clip": 0.01169231, + "auxiliary_loss_mlp": 0.01024956, + "balance_loss_clip": 1.04872036, + "balance_loss_mlp": 1.01811969, + "epoch": 0.6686707148439849, + "flos": 28986087870720.0, + "grad_norm": 1.868925726003123, + "language_loss": 0.71220756, + "learning_rate": 1.0450778292805046e-06, + "loss": 0.73414946, + "num_input_tokens_seen": 119695340, + "step": 5561, + "time_per_iteration": 2.4805686473846436 + }, + { + "auxiliary_loss_clip": 0.01159164, + "auxiliary_loss_mlp": 0.01026473, + "balance_loss_clip": 1.04660451, + "balance_loss_mlp": 1.01919794, + "epoch": 0.6687909577346239, + "flos": 23623152687360.0, + "grad_norm": 2.0247041898382103, + "language_loss": 0.78722423, + "learning_rate": 1.0443934560343267e-06, + "loss": 0.8090806, + "num_input_tokens_seen": 119716750, + "step": 5562, + "time_per_iteration": 3.313943862915039 + }, + { + "auxiliary_loss_clip": 0.01115409, + "auxiliary_loss_mlp": 0.01024399, + "balance_loss_clip": 1.04246783, + "balance_loss_mlp": 1.01709461, + "epoch": 0.668911200625263, + "flos": 23148593176320.0, + "grad_norm": 2.0556870240121534, + "language_loss": 0.78214395, + "learning_rate": 1.0437092277529034e-06, + "loss": 0.80354202, + "num_input_tokens_seen": 119736005, + "step": 5563, + "time_per_iteration": 2.525938034057617 + }, + { + "auxiliary_loss_clip": 0.01124505, + "auxiliary_loss_mlp": 0.01025881, + "balance_loss_clip": 1.04078555, + "balance_loss_mlp": 1.01880646, + "epoch": 0.6690314435159022, + "flos": 18551919853440.0, + "grad_norm": 2.0346624199803913, + "language_loss": 0.73141664, + "learning_rate": 1.0430251445400292e-06, + "loss": 0.75292051, + "num_input_tokens_seen": 119754050, + "step": 5564, + "time_per_iteration": 2.4879183769226074 + }, + { + "auxiliary_loss_clip": 0.0108693, + "auxiliary_loss_mlp": 0.01025199, + "balance_loss_clip": 1.04578424, + "balance_loss_mlp": 1.01773643, + "epoch": 0.6691516864065412, + "flos": 31759540704000.0, + "grad_norm": 2.29490684102977, + "language_loss": 0.62464035, + "learning_rate": 1.0423412064994787e-06, + "loss": 0.64576161, + "num_input_tokens_seen": 119774820, + "step": 5565, + "time_per_iteration": 2.6734249591827393 + }, + { + "auxiliary_loss_clip": 0.01130492, + "auxiliary_loss_mlp": 0.01023299, + "balance_loss_clip": 1.0444777, + "balance_loss_mlp": 1.01587188, + "epoch": 0.6692719292971803, + "flos": 34933864296960.0, + "grad_norm": 2.1299899439090906, + "language_loss": 0.73788476, + "learning_rate": 1.0416574137350064e-06, + "loss": 0.75942266, + "num_input_tokens_seen": 119795525, + "step": 5566, + "time_per_iteration": 3.489614963531494 + }, + { + "auxiliary_loss_clip": 0.01149282, + "auxiliary_loss_mlp": 0.01027406, + "balance_loss_clip": 1.04733348, + "balance_loss_mlp": 1.01975322, + "epoch": 0.6693921721878194, + "flos": 20449188230400.0, + "grad_norm": 2.0446985089643475, + "language_loss": 0.81325543, + "learning_rate": 1.0409737663503428e-06, + "loss": 0.83502233, + "num_input_tokens_seen": 119813905, + "step": 5567, + "time_per_iteration": 3.256037473678589 + }, + { + "auxiliary_loss_clip": 0.0115332, + "auxiliary_loss_mlp": 0.0102903, + "balance_loss_clip": 1.04498446, + "balance_loss_mlp": 1.02097762, + "epoch": 0.6695124150784585, + "flos": 16614538963200.0, + "grad_norm": 2.436174842291626, + "language_loss": 0.82967198, + "learning_rate": 1.040290264449196e-06, + "loss": 0.8514955, + "num_input_tokens_seen": 119832010, + "step": 5568, + "time_per_iteration": 2.4484620094299316 + }, + { + "auxiliary_loss_clip": 0.01152272, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.0483197, + "balance_loss_mlp": 1.02178299, + "epoch": 0.6696326579690975, + "flos": 26652145852800.0, + "grad_norm": 3.8486687727426134, + "language_loss": 0.63670647, + "learning_rate": 1.0396069081352532e-06, + "loss": 0.65851885, + "num_input_tokens_seen": 119851165, + "step": 5569, + "time_per_iteration": 3.2229678630828857 + }, + { + "auxiliary_loss_clip": 0.01068641, + "auxiliary_loss_mlp": 0.01002517, + "balance_loss_clip": 1.0130806, + "balance_loss_mlp": 1.00153947, + "epoch": 0.6697529008597367, + "flos": 66964603662720.0, + "grad_norm": 0.7728842107677465, + "language_loss": 0.56076854, + "learning_rate": 1.0389236975121782e-06, + "loss": 0.58148009, + "num_input_tokens_seen": 119906015, + "step": 5570, + "time_per_iteration": 2.9311363697052 + }, + { + "auxiliary_loss_clip": 0.01172728, + "auxiliary_loss_mlp": 0.01021099, + "balance_loss_clip": 1.04959297, + "balance_loss_mlp": 1.01331449, + "epoch": 0.6698731437503758, + "flos": 20886939279360.0, + "grad_norm": 2.2919833931668183, + "language_loss": 0.71307224, + "learning_rate": 1.0382406326836147e-06, + "loss": 0.7350105, + "num_input_tokens_seen": 119925160, + "step": 5571, + "time_per_iteration": 2.4139564037323 + }, + { + "auxiliary_loss_clip": 0.01163883, + "auxiliary_loss_mlp": 0.01025774, + "balance_loss_clip": 1.05001009, + "balance_loss_mlp": 1.01755416, + "epoch": 0.6699933866410148, + "flos": 20409470766720.0, + "grad_norm": 1.8400274426332732, + "language_loss": 0.7611568, + "learning_rate": 1.0375577137531828e-06, + "loss": 0.7830534, + "num_input_tokens_seen": 119943720, + "step": 5572, + "time_per_iteration": 2.4486207962036133 + }, + { + "auxiliary_loss_clip": 0.01146559, + "auxiliary_loss_mlp": 0.01025664, + "balance_loss_clip": 1.0486697, + "balance_loss_mlp": 1.01762354, + "epoch": 0.670113629531654, + "flos": 29023075900800.0, + "grad_norm": 1.5204870415256866, + "language_loss": 0.72181427, + "learning_rate": 1.0368749408244802e-06, + "loss": 0.74353653, + "num_input_tokens_seen": 119966640, + "step": 5573, + "time_per_iteration": 2.5619585514068604 + }, + { + "auxiliary_loss_clip": 0.0115276, + "auxiliary_loss_mlp": 0.01028009, + "balance_loss_clip": 1.04887307, + "balance_loss_mlp": 1.0206275, + "epoch": 0.670233872422293, + "flos": 19791699730560.0, + "grad_norm": 2.0509017342801608, + "language_loss": 0.78862309, + "learning_rate": 1.0361923140010836e-06, + "loss": 0.81043077, + "num_input_tokens_seen": 119985125, + "step": 5574, + "time_per_iteration": 2.4595985412597656 + }, + { + "auxiliary_loss_clip": 0.01158599, + "auxiliary_loss_mlp": 0.01021459, + "balance_loss_clip": 1.04609215, + "balance_loss_mlp": 1.01359069, + "epoch": 0.6703541153129321, + "flos": 24243689070720.0, + "grad_norm": 2.4843856840743532, + "language_loss": 0.63171947, + "learning_rate": 1.0355098333865455e-06, + "loss": 0.65351999, + "num_input_tokens_seen": 120004355, + "step": 5575, + "time_per_iteration": 2.474578380584717 + }, + { + "auxiliary_loss_clip": 0.01156168, + "auxiliary_loss_mlp": 0.01028428, + "balance_loss_clip": 1.05293179, + "balance_loss_mlp": 1.02114725, + "epoch": 0.6704743582035713, + "flos": 26688523351680.0, + "grad_norm": 1.5922564903622123, + "language_loss": 0.69083208, + "learning_rate": 1.0348274990844006e-06, + "loss": 0.71267807, + "num_input_tokens_seen": 120027115, + "step": 5576, + "time_per_iteration": 2.507566213607788 + }, + { + "auxiliary_loss_clip": 0.0115701, + "auxiliary_loss_mlp": 0.01026959, + "balance_loss_clip": 1.04970455, + "balance_loss_mlp": 1.01944304, + "epoch": 0.6705946010942103, + "flos": 23514379326720.0, + "grad_norm": 1.7193449444515767, + "language_loss": 0.73016864, + "learning_rate": 1.034145311198155e-06, + "loss": 0.75200832, + "num_input_tokens_seen": 120047130, + "step": 5577, + "time_per_iteration": 2.457211494445801 + }, + { + "auxiliary_loss_clip": 0.01166544, + "auxiliary_loss_mlp": 0.01020109, + "balance_loss_clip": 1.04788625, + "balance_loss_mlp": 1.01320028, + "epoch": 0.6707148439848494, + "flos": 24061011477120.0, + "grad_norm": 1.6692131200243168, + "language_loss": 0.6354475, + "learning_rate": 1.0334632698312989e-06, + "loss": 0.65731406, + "num_input_tokens_seen": 120067925, + "step": 5578, + "time_per_iteration": 2.4303925037384033 + }, + { + "auxiliary_loss_clip": 0.01135283, + "auxiliary_loss_mlp": 0.01029213, + "balance_loss_clip": 1.04557943, + "balance_loss_mlp": 1.02127075, + "epoch": 0.6708350868754885, + "flos": 22528667324160.0, + "grad_norm": 2.0549815873472648, + "language_loss": 0.75402439, + "learning_rate": 1.032781375087295e-06, + "loss": 0.77566946, + "num_input_tokens_seen": 120087825, + "step": 5579, + "time_per_iteration": 2.4749789237976074 + }, + { + "auxiliary_loss_clip": 0.01147297, + "auxiliary_loss_mlp": 0.01024672, + "balance_loss_clip": 1.05082631, + "balance_loss_mlp": 1.01757944, + "epoch": 0.6709553297661276, + "flos": 25227749047680.0, + "grad_norm": 1.795518310229102, + "language_loss": 0.67651784, + "learning_rate": 1.0320996270695891e-06, + "loss": 0.69823748, + "num_input_tokens_seen": 120108895, + "step": 5580, + "time_per_iteration": 2.531343936920166 + }, + { + "auxiliary_loss_clip": 0.01127814, + "auxiliary_loss_mlp": 0.01024539, + "balance_loss_clip": 1.04389405, + "balance_loss_mlp": 1.0167253, + "epoch": 0.6710755726567667, + "flos": 20448757267200.0, + "grad_norm": 1.7611695924655835, + "language_loss": 0.73460591, + "learning_rate": 1.0314180258815998e-06, + "loss": 0.75612944, + "num_input_tokens_seen": 120127535, + "step": 5581, + "time_per_iteration": 2.5152859687805176 + }, + { + "auxiliary_loss_clip": 0.01118559, + "auxiliary_loss_mlp": 0.01022809, + "balance_loss_clip": 1.04242873, + "balance_loss_mlp": 1.01573372, + "epoch": 0.6711958155474057, + "flos": 25995411538560.0, + "grad_norm": 1.5454506611844128, + "language_loss": 0.7423296, + "learning_rate": 1.0307365716267247e-06, + "loss": 0.76374328, + "num_input_tokens_seen": 120147980, + "step": 5582, + "time_per_iteration": 2.5693109035491943 + }, + { + "auxiliary_loss_clip": 0.01156179, + "auxiliary_loss_mlp": 0.01024722, + "balance_loss_clip": 1.04835892, + "balance_loss_mlp": 1.01736093, + "epoch": 0.6713160584380449, + "flos": 19937712516480.0, + "grad_norm": 1.9203590595304838, + "language_loss": 0.78199959, + "learning_rate": 1.0300552644083423e-06, + "loss": 0.80380857, + "num_input_tokens_seen": 120166905, + "step": 5583, + "time_per_iteration": 2.4478225708007812 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01022827, + "balance_loss_clip": 1.04927218, + "balance_loss_mlp": 1.01506698, + "epoch": 0.6714363013286839, + "flos": 18223373128320.0, + "grad_norm": 2.2636071456574576, + "language_loss": 0.72501814, + "learning_rate": 1.0293741043298036e-06, + "loss": 0.7466045, + "num_input_tokens_seen": 120185255, + "step": 5584, + "time_per_iteration": 2.4972448348999023 + }, + { + "auxiliary_loss_clip": 0.0113461, + "auxiliary_loss_mlp": 0.01029248, + "balance_loss_clip": 1.05077291, + "balance_loss_mlp": 1.02131486, + "epoch": 0.671556544219323, + "flos": 25812374808960.0, + "grad_norm": 4.039987393102375, + "language_loss": 0.71544707, + "learning_rate": 1.0286930914944436e-06, + "loss": 0.7370857, + "num_input_tokens_seen": 120205070, + "step": 5585, + "time_per_iteration": 2.544395923614502 + }, + { + "auxiliary_loss_clip": 0.01170176, + "auxiliary_loss_mlp": 0.01025143, + "balance_loss_clip": 1.04692054, + "balance_loss_mlp": 1.01794577, + "epoch": 0.6716767871099621, + "flos": 15850431918720.0, + "grad_norm": 2.4592464930007387, + "language_loss": 0.77125227, + "learning_rate": 1.0280122260055684e-06, + "loss": 0.7932055, + "num_input_tokens_seen": 120220780, + "step": 5586, + "time_per_iteration": 2.398461103439331 + }, + { + "auxiliary_loss_clip": 0.0117304, + "auxiliary_loss_mlp": 0.01029076, + "balance_loss_clip": 1.0507741, + "balance_loss_mlp": 1.02094603, + "epoch": 0.6717970300006012, + "flos": 19756112330880.0, + "grad_norm": 1.930092579841454, + "language_loss": 0.820786, + "learning_rate": 1.0273315079664652e-06, + "loss": 0.84280711, + "num_input_tokens_seen": 120238735, + "step": 5587, + "time_per_iteration": 2.4364752769470215 + }, + { + "auxiliary_loss_clip": 0.01158978, + "auxiliary_loss_mlp": 0.01022519, + "balance_loss_clip": 1.04962456, + "balance_loss_mlp": 1.01512241, + "epoch": 0.6719172728912403, + "flos": 25485049146240.0, + "grad_norm": 2.1537941645484624, + "language_loss": 0.74342352, + "learning_rate": 1.0266509374803992e-06, + "loss": 0.76523852, + "num_input_tokens_seen": 120259895, + "step": 5588, + "time_per_iteration": 2.4993505477905273 + }, + { + "auxiliary_loss_clip": 0.01170433, + "auxiliary_loss_mlp": 0.00762336, + "balance_loss_clip": 1.0483737, + "balance_loss_mlp": 1.00059915, + "epoch": 0.6720375157818794, + "flos": 15880344969600.0, + "grad_norm": 4.557218838866834, + "language_loss": 0.84566152, + "learning_rate": 1.0259705146506123e-06, + "loss": 0.86498922, + "num_input_tokens_seen": 120274790, + "step": 5589, + "time_per_iteration": 3.1294379234313965 + }, + { + "auxiliary_loss_clip": 0.0115964, + "auxiliary_loss_mlp": 0.01027452, + "balance_loss_clip": 1.04859781, + "balance_loss_mlp": 1.02023971, + "epoch": 0.6721577586725185, + "flos": 32010843231360.0, + "grad_norm": 2.0961986089760716, + "language_loss": 0.77502471, + "learning_rate": 1.025290239580324e-06, + "loss": 0.79689562, + "num_input_tokens_seen": 120295460, + "step": 5590, + "time_per_iteration": 2.5450472831726074 + }, + { + "auxiliary_loss_clip": 0.01114423, + "auxiliary_loss_mlp": 0.01028646, + "balance_loss_clip": 1.04314291, + "balance_loss_mlp": 1.02130306, + "epoch": 0.6722780015631575, + "flos": 20737873837440.0, + "grad_norm": 1.6192387201139165, + "language_loss": 0.75764674, + "learning_rate": 1.0246101123727313e-06, + "loss": 0.77907741, + "num_input_tokens_seen": 120314440, + "step": 5591, + "time_per_iteration": 2.5433592796325684 + }, + { + "auxiliary_loss_clip": 0.01155405, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.04615891, + "balance_loss_mlp": 1.023839, + "epoch": 0.6723982444537967, + "flos": 16909617191040.0, + "grad_norm": 1.8983292224355133, + "language_loss": 0.78469092, + "learning_rate": 1.0239301331310085e-06, + "loss": 0.80655217, + "num_input_tokens_seen": 120332060, + "step": 5592, + "time_per_iteration": 3.2444515228271484 + }, + { + "auxiliary_loss_clip": 0.01153206, + "auxiliary_loss_mlp": 0.01025997, + "balance_loss_clip": 1.04712093, + "balance_loss_mlp": 1.01868939, + "epoch": 0.6725184873444358, + "flos": 20667812359680.0, + "grad_norm": 1.6893222952765115, + "language_loss": 0.88701761, + "learning_rate": 1.0232503019583088e-06, + "loss": 0.90880966, + "num_input_tokens_seen": 120351670, + "step": 5593, + "time_per_iteration": 2.4609792232513428 + }, + { + "auxiliary_loss_clip": 0.01151919, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.04745698, + "balance_loss_mlp": 1.01968396, + "epoch": 0.6726387302350748, + "flos": 23727616416000.0, + "grad_norm": 1.8391984003790942, + "language_loss": 0.69723374, + "learning_rate": 1.0225706189577619e-06, + "loss": 0.71902609, + "num_input_tokens_seen": 120370195, + "step": 5594, + "time_per_iteration": 3.295260190963745 + }, + { + "auxiliary_loss_clip": 0.01158551, + "auxiliary_loss_mlp": 0.01025529, + "balance_loss_clip": 1.04849505, + "balance_loss_mlp": 1.01766157, + "epoch": 0.672758973125714, + "flos": 15188274650880.0, + "grad_norm": 2.4426036981054753, + "language_loss": 0.7461617, + "learning_rate": 1.021891084232475e-06, + "loss": 0.76800251, + "num_input_tokens_seen": 120388130, + "step": 5595, + "time_per_iteration": 2.416186571121216 + }, + { + "auxiliary_loss_clip": 0.01154593, + "auxiliary_loss_mlp": 0.01027123, + "balance_loss_clip": 1.04541349, + "balance_loss_mlp": 1.01931763, + "epoch": 0.672879216016353, + "flos": 18077252601600.0, + "grad_norm": 3.0578557889030766, + "language_loss": 0.80070502, + "learning_rate": 1.0212116978855325e-06, + "loss": 0.82252216, + "num_input_tokens_seen": 120406145, + "step": 5596, + "time_per_iteration": 3.1221911907196045 + }, + { + "auxiliary_loss_clip": 0.01126155, + "auxiliary_loss_mlp": 0.01021038, + "balance_loss_clip": 1.04460335, + "balance_loss_mlp": 1.01410341, + "epoch": 0.6729994589069921, + "flos": 23476349802240.0, + "grad_norm": 1.710851739438485, + "language_loss": 0.78964907, + "learning_rate": 1.020532460019997e-06, + "loss": 0.81112099, + "num_input_tokens_seen": 120425395, + "step": 5597, + "time_per_iteration": 2.524912118911743 + }, + { + "auxiliary_loss_clip": 0.01091593, + "auxiliary_loss_mlp": 0.01026481, + "balance_loss_clip": 1.04280186, + "balance_loss_mlp": 1.01897693, + "epoch": 0.6731197017976313, + "flos": 26322018929280.0, + "grad_norm": 1.9175062700451448, + "language_loss": 0.71083069, + "learning_rate": 1.0198533707389096e-06, + "loss": 0.73201144, + "num_input_tokens_seen": 120446270, + "step": 5598, + "time_per_iteration": 2.6498665809631348 + }, + { + "auxiliary_loss_clip": 0.01153994, + "auxiliary_loss_mlp": 0.00762519, + "balance_loss_clip": 1.04830921, + "balance_loss_mlp": 1.00066662, + "epoch": 0.6732399446882703, + "flos": 21616428591360.0, + "grad_norm": 4.479655399412542, + "language_loss": 0.73157728, + "learning_rate": 1.0191744301452853e-06, + "loss": 0.75074244, + "num_input_tokens_seen": 120465570, + "step": 5599, + "time_per_iteration": 2.437769889831543 + }, + { + "auxiliary_loss_clip": 0.0116932, + "auxiliary_loss_mlp": 0.01026874, + "balance_loss_clip": 1.04865634, + "balance_loss_mlp": 1.01938212, + "epoch": 0.6733601875789094, + "flos": 25880173729920.0, + "grad_norm": 1.6607557564656918, + "language_loss": 0.70416641, + "learning_rate": 1.0184956383421208e-06, + "loss": 0.72612834, + "num_input_tokens_seen": 120484220, + "step": 5600, + "time_per_iteration": 2.4456331729888916 + }, + { + "auxiliary_loss_clip": 0.01157701, + "auxiliary_loss_mlp": 0.01025374, + "balance_loss_clip": 1.04857826, + "balance_loss_mlp": 1.01795614, + "epoch": 0.6734804304695485, + "flos": 22929573997440.0, + "grad_norm": 1.9796694561357622, + "language_loss": 0.65360671, + "learning_rate": 1.017816995432387e-06, + "loss": 0.67543745, + "num_input_tokens_seen": 120503320, + "step": 5601, + "time_per_iteration": 2.464118719100952 + }, + { + "auxiliary_loss_clip": 0.01142149, + "auxiliary_loss_mlp": 0.01023438, + "balance_loss_clip": 1.04594636, + "balance_loss_mlp": 1.0158149, + "epoch": 0.6736006733601876, + "flos": 18697968552960.0, + "grad_norm": 2.2448856128377646, + "language_loss": 0.74459243, + "learning_rate": 1.0171385015190353e-06, + "loss": 0.76624835, + "num_input_tokens_seen": 120523180, + "step": 5602, + "time_per_iteration": 2.466320276260376 + }, + { + "auxiliary_loss_clip": 0.01140067, + "auxiliary_loss_mlp": 0.00762745, + "balance_loss_clip": 1.05057693, + "balance_loss_mlp": 1.00071073, + "epoch": 0.6737209162508266, + "flos": 19427745173760.0, + "grad_norm": 2.108869883709841, + "language_loss": 0.7301839, + "learning_rate": 1.0164601567049908e-06, + "loss": 0.74921191, + "num_input_tokens_seen": 120541710, + "step": 5603, + "time_per_iteration": 2.4771969318389893 + }, + { + "auxiliary_loss_clip": 0.01142128, + "auxiliary_loss_mlp": 0.01026764, + "balance_loss_clip": 1.04705882, + "balance_loss_mlp": 1.0188005, + "epoch": 0.6738411591414658, + "flos": 20158060498560.0, + "grad_norm": 1.5791783503968118, + "language_loss": 0.8043083, + "learning_rate": 1.015781961093158e-06, + "loss": 0.82599723, + "num_input_tokens_seen": 120561030, + "step": 5604, + "time_per_iteration": 2.4761765003204346 + }, + { + "auxiliary_loss_clip": 0.01143991, + "auxiliary_loss_mlp": 0.01026062, + "balance_loss_clip": 1.04299426, + "balance_loss_mlp": 1.01859403, + "epoch": 0.6739614020321049, + "flos": 21653847584640.0, + "grad_norm": 1.9417755597284516, + "language_loss": 0.77187514, + "learning_rate": 1.0151039147864197e-06, + "loss": 0.79357564, + "num_input_tokens_seen": 120581005, + "step": 5605, + "time_per_iteration": 2.4847517013549805 + }, + { + "auxiliary_loss_clip": 0.010836, + "auxiliary_loss_mlp": 0.01023915, + "balance_loss_clip": 1.04823756, + "balance_loss_mlp": 1.01589823, + "epoch": 0.6740816449227439, + "flos": 19171702051200.0, + "grad_norm": 3.8051607042570157, + "language_loss": 0.65878797, + "learning_rate": 1.0144260178876336e-06, + "loss": 0.6798631, + "num_input_tokens_seen": 120600350, + "step": 5606, + "time_per_iteration": 2.6127288341522217 + }, + { + "auxiliary_loss_clip": 0.01147329, + "auxiliary_loss_mlp": 0.01020958, + "balance_loss_clip": 1.04611468, + "balance_loss_mlp": 1.01394498, + "epoch": 0.6742018878133831, + "flos": 21097015971840.0, + "grad_norm": 2.5932104089263657, + "language_loss": 0.67258775, + "learning_rate": 1.0137482704996388e-06, + "loss": 0.69427061, + "num_input_tokens_seen": 120614700, + "step": 5607, + "time_per_iteration": 2.466639518737793 + }, + { + "auxiliary_loss_clip": 0.01129851, + "auxiliary_loss_mlp": 0.0102748, + "balance_loss_clip": 1.04481328, + "balance_loss_mlp": 1.01980269, + "epoch": 0.6743221307040221, + "flos": 23549966726400.0, + "grad_norm": 4.481755641195638, + "language_loss": 0.79232001, + "learning_rate": 1.0130706727252461e-06, + "loss": 0.81389338, + "num_input_tokens_seen": 120631755, + "step": 5608, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.0113129, + "auxiliary_loss_mlp": 0.0102803, + "balance_loss_clip": 1.04560113, + "balance_loss_mlp": 1.02046633, + "epoch": 0.6744423735946612, + "flos": 16249542912000.0, + "grad_norm": 2.7969143063846333, + "language_loss": 0.68080229, + "learning_rate": 1.0123932246672468e-06, + "loss": 0.70239544, + "num_input_tokens_seen": 120645900, + "step": 5609, + "time_per_iteration": 2.4734508991241455 + }, + { + "auxiliary_loss_clip": 0.01026974, + "auxiliary_loss_mlp": 0.00753028, + "balance_loss_clip": 1.01471448, + "balance_loss_mlp": 0.9998517, + "epoch": 0.6745626164853004, + "flos": 57843257829120.0, + "grad_norm": 0.7804044379563917, + "language_loss": 0.55836391, + "learning_rate": 1.0117159264284114e-06, + "loss": 0.57616389, + "num_input_tokens_seen": 120709070, + "step": 5610, + "time_per_iteration": 3.112058162689209 + }, + { + "auxiliary_loss_clip": 0.011472, + "auxiliary_loss_mlp": 0.0102519, + "balance_loss_clip": 1.04830813, + "balance_loss_mlp": 1.0180583, + "epoch": 0.6746828593759394, + "flos": 20485027025280.0, + "grad_norm": 1.8025215069265428, + "language_loss": 0.77049118, + "learning_rate": 1.0110387781114837e-06, + "loss": 0.79221511, + "num_input_tokens_seen": 120727685, + "step": 5611, + "time_per_iteration": 2.481444835662842 + }, + { + "auxiliary_loss_clip": 0.01167564, + "auxiliary_loss_mlp": 0.01026111, + "balance_loss_clip": 1.0479244, + "balance_loss_mlp": 1.01867855, + "epoch": 0.6748031022665785, + "flos": 19208223204480.0, + "grad_norm": 2.1587504679037273, + "language_loss": 0.77094889, + "learning_rate": 1.0103617798191872e-06, + "loss": 0.7928856, + "num_input_tokens_seen": 120747160, + "step": 5612, + "time_per_iteration": 2.4058477878570557 + }, + { + "auxiliary_loss_clip": 0.01141566, + "auxiliary_loss_mlp": 0.01023379, + "balance_loss_clip": 1.04869998, + "balance_loss_mlp": 1.01558232, + "epoch": 0.6749233451572175, + "flos": 15195026407680.0, + "grad_norm": 2.550253404705486, + "language_loss": 0.82835215, + "learning_rate": 1.0096849316542217e-06, + "loss": 0.85000163, + "num_input_tokens_seen": 120763710, + "step": 5613, + "time_per_iteration": 2.4668567180633545 + }, + { + "auxiliary_loss_clip": 0.01073191, + "auxiliary_loss_mlp": 0.01020718, + "balance_loss_clip": 1.03747666, + "balance_loss_mlp": 1.01296961, + "epoch": 0.6750435880478567, + "flos": 26499489050880.0, + "grad_norm": 2.187206510248746, + "language_loss": 0.74766231, + "learning_rate": 1.0090082337192643e-06, + "loss": 0.76860142, + "num_input_tokens_seen": 120783355, + "step": 5614, + "time_per_iteration": 2.652745246887207 + }, + { + "auxiliary_loss_clip": 0.01091801, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.03603482, + "balance_loss_mlp": 1.01978159, + "epoch": 0.6751638309384957, + "flos": 23404313076480.0, + "grad_norm": 1.9790381324417856, + "language_loss": 0.78248245, + "learning_rate": 1.0083316861169705e-06, + "loss": 0.80367184, + "num_input_tokens_seen": 120802090, + "step": 5615, + "time_per_iteration": 2.5952847003936768 + }, + { + "auxiliary_loss_clip": 0.01131985, + "auxiliary_loss_mlp": 0.01023283, + "balance_loss_clip": 1.04292858, + "balance_loss_mlp": 1.01494479, + "epoch": 0.6752840738291348, + "flos": 23441408847360.0, + "grad_norm": 2.5462130476113334, + "language_loss": 0.71312702, + "learning_rate": 1.0076552889499713e-06, + "loss": 0.73467976, + "num_input_tokens_seen": 120822855, + "step": 5616, + "time_per_iteration": 3.2727737426757812 + }, + { + "auxiliary_loss_clip": 0.01156204, + "auxiliary_loss_mlp": 0.01025133, + "balance_loss_clip": 1.0497458, + "balance_loss_mlp": 1.01835907, + "epoch": 0.675404316719774, + "flos": 30335826257280.0, + "grad_norm": 2.3543251133200545, + "language_loss": 0.73736441, + "learning_rate": 1.006979042320876e-06, + "loss": 0.75917774, + "num_input_tokens_seen": 120843070, + "step": 5617, + "time_per_iteration": 2.512648344039917 + }, + { + "auxiliary_loss_clip": 0.01137699, + "auxiliary_loss_mlp": 0.0102067, + "balance_loss_clip": 1.04344237, + "balance_loss_mlp": 1.01275742, + "epoch": 0.675524559610413, + "flos": 23622613983360.0, + "grad_norm": 2.268019991433727, + "language_loss": 0.62844241, + "learning_rate": 1.0063029463322702e-06, + "loss": 0.65002608, + "num_input_tokens_seen": 120863345, + "step": 5618, + "time_per_iteration": 2.510601282119751 + }, + { + "auxiliary_loss_clip": 0.01107286, + "auxiliary_loss_mlp": 0.00762807, + "balance_loss_clip": 1.04074073, + "balance_loss_mlp": 1.00069857, + "epoch": 0.6756448025010521, + "flos": 21248631279360.0, + "grad_norm": 2.3227387283972925, + "language_loss": 0.752038, + "learning_rate": 1.0056270010867164e-06, + "loss": 0.7707389, + "num_input_tokens_seen": 120880915, + "step": 5619, + "time_per_iteration": 3.3766841888427734 + }, + { + "auxiliary_loss_clip": 0.01140258, + "auxiliary_loss_mlp": 0.01027995, + "balance_loss_clip": 1.04161584, + "balance_loss_mlp": 1.02000773, + "epoch": 0.6757650453916912, + "flos": 21646521210240.0, + "grad_norm": 2.4100007711594023, + "language_loss": 0.77961367, + "learning_rate": 1.004951206686758e-06, + "loss": 0.80129623, + "num_input_tokens_seen": 120899190, + "step": 5620, + "time_per_iteration": 3.291287660598755 + }, + { + "auxiliary_loss_clip": 0.01150151, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.04589152, + "balance_loss_mlp": 1.02357376, + "epoch": 0.6758852882823303, + "flos": 21795658479360.0, + "grad_norm": 2.526588143775918, + "language_loss": 0.71496648, + "learning_rate": 1.0042755632349087e-06, + "loss": 0.73677897, + "num_input_tokens_seen": 120916080, + "step": 5621, + "time_per_iteration": 2.4356589317321777 + }, + { + "auxiliary_loss_clip": 0.0112673, + "auxiliary_loss_mlp": 0.01027917, + "balance_loss_clip": 1.04364705, + "balance_loss_mlp": 1.02014494, + "epoch": 0.6760055311729694, + "flos": 27088783580160.0, + "grad_norm": 2.2547904810140884, + "language_loss": 0.62818456, + "learning_rate": 1.0036000708336653e-06, + "loss": 0.64973104, + "num_input_tokens_seen": 120935210, + "step": 5622, + "time_per_iteration": 3.3229434490203857 + }, + { + "auxiliary_loss_clip": 0.01145278, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.04803848, + "balance_loss_mlp": 1.02096009, + "epoch": 0.6761257740636085, + "flos": 17999792922240.0, + "grad_norm": 2.546558619472363, + "language_loss": 0.79283339, + "learning_rate": 1.0029247295854984e-06, + "loss": 0.8145721, + "num_input_tokens_seen": 120951830, + "step": 5623, + "time_per_iteration": 2.4617576599121094 + }, + { + "auxiliary_loss_clip": 0.01134581, + "auxiliary_loss_mlp": 0.01025958, + "balance_loss_clip": 1.04796314, + "balance_loss_mlp": 1.0190202, + "epoch": 0.6762460169542476, + "flos": 15121912273920.0, + "grad_norm": 2.0280833066145294, + "language_loss": 0.71425265, + "learning_rate": 1.0022495395928588e-06, + "loss": 0.73585802, + "num_input_tokens_seen": 120970310, + "step": 5624, + "time_per_iteration": 2.5078158378601074 + }, + { + "auxiliary_loss_clip": 0.01070963, + "auxiliary_loss_mlp": 0.01002205, + "balance_loss_clip": 1.01545882, + "balance_loss_mlp": 1.00110233, + "epoch": 0.6763662598448866, + "flos": 67886970030720.0, + "grad_norm": 0.7908352360251991, + "language_loss": 0.62347364, + "learning_rate": 1.0015745009581697e-06, + "loss": 0.64420533, + "num_input_tokens_seen": 121031915, + "step": 5625, + "time_per_iteration": 3.0608749389648438 + }, + { + "auxiliary_loss_clip": 0.01153176, + "auxiliary_loss_mlp": 0.01023455, + "balance_loss_clip": 1.04903865, + "balance_loss_mlp": 1.01634443, + "epoch": 0.6764865027355258, + "flos": 20631829910400.0, + "grad_norm": 1.8648441227093218, + "language_loss": 0.66912121, + "learning_rate": 1.0008996137838343e-06, + "loss": 0.69088751, + "num_input_tokens_seen": 121050890, + "step": 5626, + "time_per_iteration": 2.4541313648223877 + }, + { + "auxiliary_loss_clip": 0.01174715, + "auxiliary_loss_mlp": 0.01026978, + "balance_loss_clip": 1.05032587, + "balance_loss_mlp": 1.01912785, + "epoch": 0.6766067456261649, + "flos": 21215809226880.0, + "grad_norm": 1.9934190685119748, + "language_loss": 0.79698277, + "learning_rate": 1.000224878172234e-06, + "loss": 0.81899965, + "num_input_tokens_seen": 121070015, + "step": 5627, + "time_per_iteration": 2.4292218685150146 + }, + { + "auxiliary_loss_clip": 0.01158191, + "auxiliary_loss_mlp": 0.01024436, + "balance_loss_clip": 1.04711056, + "balance_loss_mlp": 1.01689017, + "epoch": 0.6767269885168039, + "flos": 19938251220480.0, + "grad_norm": 5.117672667238519, + "language_loss": 0.72491854, + "learning_rate": 9.99550294225724e-07, + "loss": 0.74674487, + "num_input_tokens_seen": 121089170, + "step": 5628, + "time_per_iteration": 2.4629600048065186 + }, + { + "auxiliary_loss_clip": 0.01117135, + "auxiliary_loss_mlp": 0.01028004, + "balance_loss_clip": 1.04237509, + "balance_loss_mlp": 1.02004123, + "epoch": 0.6768472314074431, + "flos": 20814076540800.0, + "grad_norm": 2.005189705533757, + "language_loss": 0.72329581, + "learning_rate": 9.988758620466402e-07, + "loss": 0.74474722, + "num_input_tokens_seen": 121108040, + "step": 5629, + "time_per_iteration": 2.5583128929138184 + }, + { + "auxiliary_loss_clip": 0.01103109, + "auxiliary_loss_mlp": 0.01024231, + "balance_loss_clip": 1.04010439, + "balance_loss_mlp": 1.01713777, + "epoch": 0.6769674742980821, + "flos": 23186012169600.0, + "grad_norm": 1.5534478381106351, + "language_loss": 0.76141167, + "learning_rate": 9.982015817372917e-07, + "loss": 0.78268504, + "num_input_tokens_seen": 121128480, + "step": 5630, + "time_per_iteration": 2.6117234230041504 + }, + { + "auxiliary_loss_clip": 0.0110992, + "auxiliary_loss_mlp": 0.01024894, + "balance_loss_clip": 1.03948462, + "balance_loss_mlp": 1.01715136, + "epoch": 0.6770877171887212, + "flos": 24242934885120.0, + "grad_norm": 2.455341481587658, + "language_loss": 0.81844622, + "learning_rate": 9.975274533999657e-07, + "loss": 0.83979434, + "num_input_tokens_seen": 121148010, + "step": 5631, + "time_per_iteration": 2.5707125663757324 + }, + { + "auxiliary_loss_clip": 0.01171228, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.04837656, + "balance_loss_mlp": 1.02057219, + "epoch": 0.6772079600793603, + "flos": 18141567903360.0, + "grad_norm": 2.783368541098781, + "language_loss": 0.83714271, + "learning_rate": 9.96853477136929e-07, + "loss": 0.8591404, + "num_input_tokens_seen": 121162755, + "step": 5632, + "time_per_iteration": 2.3982958793640137 + }, + { + "auxiliary_loss_clip": 0.01119539, + "auxiliary_loss_mlp": 0.01023011, + "balance_loss_clip": 1.04142535, + "balance_loss_mlp": 1.01550364, + "epoch": 0.6773282029699994, + "flos": 22452069571200.0, + "grad_norm": 2.588647034896108, + "language_loss": 0.75288522, + "learning_rate": 9.96179653050422e-07, + "loss": 0.77431077, + "num_input_tokens_seen": 121182915, + "step": 5633, + "time_per_iteration": 2.650327682495117 + }, + { + "auxiliary_loss_clip": 0.0112272, + "auxiliary_loss_mlp": 0.01024975, + "balance_loss_clip": 1.04434836, + "balance_loss_mlp": 1.0171752, + "epoch": 0.6774484458606385, + "flos": 18693730748160.0, + "grad_norm": 2.1216665205693612, + "language_loss": 0.73863685, + "learning_rate": 9.955059812426635e-07, + "loss": 0.76011372, + "num_input_tokens_seen": 121200445, + "step": 5634, + "time_per_iteration": 2.4857876300811768 + }, + { + "auxiliary_loss_clip": 0.01172598, + "auxiliary_loss_mlp": 0.01025694, + "balance_loss_clip": 1.05231726, + "balance_loss_mlp": 1.01793623, + "epoch": 0.6775686887512776, + "flos": 25994046821760.0, + "grad_norm": 3.080649696603547, + "language_loss": 0.82902557, + "learning_rate": 9.948324618158493e-07, + "loss": 0.85100847, + "num_input_tokens_seen": 121220785, + "step": 5635, + "time_per_iteration": 2.453493118286133 + }, + { + "auxiliary_loss_clip": 0.01156853, + "auxiliary_loss_mlp": 0.01027071, + "balance_loss_clip": 1.04499531, + "balance_loss_mlp": 1.01935267, + "epoch": 0.6776889316419167, + "flos": 13587987922560.0, + "grad_norm": 2.1913885316081774, + "language_loss": 0.77460003, + "learning_rate": 9.941590948721502e-07, + "loss": 0.79643929, + "num_input_tokens_seen": 121237985, + "step": 5636, + "time_per_iteration": 2.4147961139678955 + }, + { + "auxiliary_loss_clip": 0.01136668, + "auxiliary_loss_mlp": 0.01023797, + "balance_loss_clip": 1.04621148, + "balance_loss_mlp": 1.01712763, + "epoch": 0.6778091745325557, + "flos": 27601121220480.0, + "grad_norm": 1.6560030482033634, + "language_loss": 0.76536179, + "learning_rate": 9.934858805137188e-07, + "loss": 0.78696638, + "num_input_tokens_seen": 121258635, + "step": 5637, + "time_per_iteration": 2.535013437271118 + }, + { + "auxiliary_loss_clip": 0.01149996, + "auxiliary_loss_mlp": 0.0102635, + "balance_loss_clip": 1.04593265, + "balance_loss_mlp": 1.01905704, + "epoch": 0.6779294174231949, + "flos": 18734058743040.0, + "grad_norm": 1.6317033481784171, + "language_loss": 0.80404937, + "learning_rate": 9.92812818842677e-07, + "loss": 0.82581282, + "num_input_tokens_seen": 121277810, + "step": 5638, + "time_per_iteration": 2.4280025959014893 + }, + { + "auxiliary_loss_clip": 0.01154159, + "auxiliary_loss_mlp": 0.01024871, + "balance_loss_clip": 1.04875386, + "balance_loss_mlp": 1.01741755, + "epoch": 0.678049660313834, + "flos": 45873797765760.0, + "grad_norm": 1.728243752163979, + "language_loss": 0.64215094, + "learning_rate": 9.921399099611306e-07, + "loss": 0.66394126, + "num_input_tokens_seen": 121298975, + "step": 5639, + "time_per_iteration": 2.7496955394744873 + }, + { + "auxiliary_loss_clip": 0.01131616, + "auxiliary_loss_mlp": 0.00762256, + "balance_loss_clip": 1.04502487, + "balance_loss_mlp": 1.00069189, + "epoch": 0.678169903204473, + "flos": 19974556892160.0, + "grad_norm": 1.6785396115507052, + "language_loss": 0.69072688, + "learning_rate": 9.914671539711588e-07, + "loss": 0.70966566, + "num_input_tokens_seen": 121318495, + "step": 5640, + "time_per_iteration": 2.531480073928833 + }, + { + "auxiliary_loss_clip": 0.01082379, + "auxiliary_loss_mlp": 0.01023493, + "balance_loss_clip": 1.04075789, + "balance_loss_mlp": 1.01645339, + "epoch": 0.6782901460951122, + "flos": 21395613732480.0, + "grad_norm": 2.152253398748627, + "language_loss": 0.78218687, + "learning_rate": 9.90794550974817e-07, + "loss": 0.80324554, + "num_input_tokens_seen": 121338890, + "step": 5641, + "time_per_iteration": 2.6066811084747314 + }, + { + "auxiliary_loss_clip": 0.01122211, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.04400957, + "balance_loss_mlp": 1.0220263, + "epoch": 0.6784103889857512, + "flos": 21434002392960.0, + "grad_norm": 3.9983651578338155, + "language_loss": 0.81072259, + "learning_rate": 9.901221010741407e-07, + "loss": 0.83224529, + "num_input_tokens_seen": 121358210, + "step": 5642, + "time_per_iteration": 3.3362085819244385 + }, + { + "auxiliary_loss_clip": 0.01159912, + "auxiliary_loss_mlp": 0.01026965, + "balance_loss_clip": 1.04770613, + "balance_loss_mlp": 1.01973498, + "epoch": 0.6785306318763903, + "flos": 32671923091200.0, + "grad_norm": 1.8751191017556763, + "language_loss": 0.74953198, + "learning_rate": 9.894498043711375e-07, + "loss": 0.77140081, + "num_input_tokens_seen": 121379955, + "step": 5643, + "time_per_iteration": 2.5505597591400146 + }, + { + "auxiliary_loss_clip": 0.01139171, + "auxiliary_loss_mlp": 0.01023712, + "balance_loss_clip": 1.04471803, + "balance_loss_mlp": 1.01592207, + "epoch": 0.6786508747670293, + "flos": 25632139340160.0, + "grad_norm": 2.6386515211156576, + "language_loss": 0.69506228, + "learning_rate": 9.887776609677962e-07, + "loss": 0.71669114, + "num_input_tokens_seen": 121401325, + "step": 5644, + "time_per_iteration": 2.537651300430298 + }, + { + "auxiliary_loss_clip": 0.01116392, + "auxiliary_loss_mlp": 0.01023004, + "balance_loss_clip": 1.03983223, + "balance_loss_mlp": 1.01574421, + "epoch": 0.6787711176576685, + "flos": 19171881619200.0, + "grad_norm": 1.6030564398697167, + "language_loss": 0.72312951, + "learning_rate": 9.88105670966079e-07, + "loss": 0.74452347, + "num_input_tokens_seen": 121419785, + "step": 5645, + "time_per_iteration": 3.385706663131714 + }, + { + "auxiliary_loss_clip": 0.01100664, + "auxiliary_loss_mlp": 0.01023297, + "balance_loss_clip": 1.04292321, + "balance_loss_mlp": 1.01652002, + "epoch": 0.6788913605483076, + "flos": 13985159581440.0, + "grad_norm": 1.8973781655752906, + "language_loss": 0.78938323, + "learning_rate": 9.874338344679283e-07, + "loss": 0.81062281, + "num_input_tokens_seen": 121435630, + "step": 5646, + "time_per_iteration": 3.346597671508789 + }, + { + "auxiliary_loss_clip": 0.01166025, + "auxiliary_loss_mlp": 0.01025239, + "balance_loss_clip": 1.0481391, + "balance_loss_mlp": 1.01830745, + "epoch": 0.6790116034389466, + "flos": 22017586659840.0, + "grad_norm": 1.6880262874068503, + "language_loss": 0.73913878, + "learning_rate": 9.86762151575259e-07, + "loss": 0.76105142, + "num_input_tokens_seen": 121455625, + "step": 5647, + "time_per_iteration": 2.4226245880126953 + }, + { + "auxiliary_loss_clip": 0.0111374, + "auxiliary_loss_mlp": 0.00761737, + "balance_loss_clip": 1.04470301, + "balance_loss_mlp": 1.00074291, + "epoch": 0.6791318463295858, + "flos": 20922454851840.0, + "grad_norm": 1.489565348735229, + "language_loss": 0.80265582, + "learning_rate": 9.860906223899651e-07, + "loss": 0.8214106, + "num_input_tokens_seen": 121475020, + "step": 5648, + "time_per_iteration": 2.552877902984619 + }, + { + "auxiliary_loss_clip": 0.01146673, + "auxiliary_loss_mlp": 0.01027643, + "balance_loss_clip": 1.04672384, + "balance_loss_mlp": 1.02049041, + "epoch": 0.6792520892202248, + "flos": 28512749422080.0, + "grad_norm": 1.736217289933732, + "language_loss": 0.75607276, + "learning_rate": 9.854192470139184e-07, + "loss": 0.77781594, + "num_input_tokens_seen": 121496500, + "step": 5649, + "time_per_iteration": 3.318429708480835 + }, + { + "auxiliary_loss_clip": 0.01139207, + "auxiliary_loss_mlp": 0.01027389, + "balance_loss_clip": 1.04700947, + "balance_loss_mlp": 1.02064776, + "epoch": 0.6793723321108639, + "flos": 20011904058240.0, + "grad_norm": 2.1702929071846317, + "language_loss": 0.71754479, + "learning_rate": 9.847480255489645e-07, + "loss": 0.73921072, + "num_input_tokens_seen": 121515525, + "step": 5650, + "time_per_iteration": 2.4825146198272705 + }, + { + "auxiliary_loss_clip": 0.01144409, + "auxiliary_loss_mlp": 0.01023607, + "balance_loss_clip": 1.04576802, + "balance_loss_mlp": 1.01654983, + "epoch": 0.6794925750015031, + "flos": 26649488246400.0, + "grad_norm": 2.8997197262430947, + "language_loss": 0.69172394, + "learning_rate": 9.840769580969295e-07, + "loss": 0.71340412, + "num_input_tokens_seen": 121535965, + "step": 5651, + "time_per_iteration": 2.545766830444336 + }, + { + "auxiliary_loss_clip": 0.01147496, + "auxiliary_loss_mlp": 0.01022858, + "balance_loss_clip": 1.04616284, + "balance_loss_mlp": 1.01576543, + "epoch": 0.6796128178921421, + "flos": 21580374314880.0, + "grad_norm": 1.863022121432462, + "language_loss": 0.79777551, + "learning_rate": 9.834060447596114e-07, + "loss": 0.81947905, + "num_input_tokens_seen": 121555235, + "step": 5652, + "time_per_iteration": 2.531484603881836 + }, + { + "auxiliary_loss_clip": 0.01156653, + "auxiliary_loss_mlp": 0.01022512, + "balance_loss_clip": 1.04559255, + "balance_loss_mlp": 1.01466799, + "epoch": 0.6797330607827812, + "flos": 22492002516480.0, + "grad_norm": 2.1144039437172575, + "language_loss": 0.77824962, + "learning_rate": 9.827352856387868e-07, + "loss": 0.80004132, + "num_input_tokens_seen": 121574945, + "step": 5653, + "time_per_iteration": 2.479072332382202 + }, + { + "auxiliary_loss_clip": 0.01021827, + "auxiliary_loss_mlp": 0.01002487, + "balance_loss_clip": 1.01295829, + "balance_loss_mlp": 1.00139046, + "epoch": 0.6798533036734203, + "flos": 66306648286080.0, + "grad_norm": 0.7751288183603556, + "language_loss": 0.64240062, + "learning_rate": 9.820646808362118e-07, + "loss": 0.66264367, + "num_input_tokens_seen": 121641200, + "step": 5654, + "time_per_iteration": 3.2031667232513428 + }, + { + "auxiliary_loss_clip": 0.01136489, + "auxiliary_loss_mlp": 0.01026423, + "balance_loss_clip": 1.04510152, + "balance_loss_mlp": 1.01930022, + "epoch": 0.6799735465640594, + "flos": 16180163792640.0, + "grad_norm": 2.072371955800868, + "language_loss": 0.72879732, + "learning_rate": 9.813942304536154e-07, + "loss": 0.75042641, + "num_input_tokens_seen": 121659170, + "step": 5655, + "time_per_iteration": 2.459920644760132 + }, + { + "auxiliary_loss_clip": 0.01141359, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.04512191, + "balance_loss_mlp": 1.02179813, + "epoch": 0.6800937894546984, + "flos": 22125749489280.0, + "grad_norm": 1.7380389302859622, + "language_loss": 0.63639247, + "learning_rate": 9.807239345927043e-07, + "loss": 0.65809464, + "num_input_tokens_seen": 121679180, + "step": 5656, + "time_per_iteration": 2.489053249359131 + }, + { + "auxiliary_loss_clip": 0.01142877, + "auxiliary_loss_mlp": 0.0102627, + "balance_loss_clip": 1.042503, + "balance_loss_mlp": 1.01863456, + "epoch": 0.6802140323453376, + "flos": 31612953300480.0, + "grad_norm": 2.528153415639979, + "language_loss": 0.72309339, + "learning_rate": 9.80053793355162e-07, + "loss": 0.74478483, + "num_input_tokens_seen": 121697875, + "step": 5657, + "time_per_iteration": 2.5537240505218506 + }, + { + "auxiliary_loss_clip": 0.01109629, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.04368401, + "balance_loss_mlp": 1.01950884, + "epoch": 0.6803342752359767, + "flos": 17712938908800.0, + "grad_norm": 2.1152385311104633, + "language_loss": 0.7483629, + "learning_rate": 9.793838068426472e-07, + "loss": 0.76973438, + "num_input_tokens_seen": 121715570, + "step": 5658, + "time_per_iteration": 2.50657057762146 + }, + { + "auxiliary_loss_clip": 0.01168706, + "auxiliary_loss_mlp": 0.01025539, + "balance_loss_clip": 1.04902971, + "balance_loss_mlp": 1.01799059, + "epoch": 0.6804545181266157, + "flos": 11326800902400.0, + "grad_norm": 2.6904177023535283, + "language_loss": 0.60966849, + "learning_rate": 9.78713975156799e-07, + "loss": 0.63161087, + "num_input_tokens_seen": 121731435, + "step": 5659, + "time_per_iteration": 2.4328441619873047 + }, + { + "auxiliary_loss_clip": 0.01130212, + "auxiliary_loss_mlp": 0.01024989, + "balance_loss_clip": 1.04981244, + "balance_loss_mlp": 1.01722813, + "epoch": 0.6805747610172549, + "flos": 29350976181120.0, + "grad_norm": 1.6504029865941823, + "language_loss": 0.71490681, + "learning_rate": 9.780442983992273e-07, + "loss": 0.73645884, + "num_input_tokens_seen": 121749950, + "step": 5660, + "time_per_iteration": 2.57289719581604 + }, + { + "auxiliary_loss_clip": 0.01135305, + "auxiliary_loss_mlp": 0.01025417, + "balance_loss_clip": 1.04475737, + "balance_loss_mlp": 1.01796675, + "epoch": 0.680695003907894, + "flos": 37631868612480.0, + "grad_norm": 1.707315684940208, + "language_loss": 0.71813023, + "learning_rate": 9.773747766715238e-07, + "loss": 0.73973745, + "num_input_tokens_seen": 121770770, + "step": 5661, + "time_per_iteration": 2.6178576946258545 + }, + { + "auxiliary_loss_clip": 0.01141844, + "auxiliary_loss_mlp": 0.01026165, + "balance_loss_clip": 1.04405189, + "balance_loss_mlp": 1.01876211, + "epoch": 0.680815246798533, + "flos": 22127365601280.0, + "grad_norm": 1.9599068485474205, + "language_loss": 0.80376077, + "learning_rate": 9.767054100752536e-07, + "loss": 0.82544082, + "num_input_tokens_seen": 121790720, + "step": 5662, + "time_per_iteration": 2.475688934326172 + }, + { + "auxiliary_loss_clip": 0.01128996, + "auxiliary_loss_mlp": 0.01024863, + "balance_loss_clip": 1.04531837, + "balance_loss_mlp": 1.01746607, + "epoch": 0.6809354896891722, + "flos": 17201822330880.0, + "grad_norm": 1.8482289044336226, + "language_loss": 0.81643987, + "learning_rate": 9.760361987119584e-07, + "loss": 0.83797848, + "num_input_tokens_seen": 121808455, + "step": 5663, + "time_per_iteration": 2.4713306427001953 + }, + { + "auxiliary_loss_clip": 0.011397, + "auxiliary_loss_mlp": 0.01027554, + "balance_loss_clip": 1.04399204, + "balance_loss_mlp": 1.01951385, + "epoch": 0.6810557325798112, + "flos": 12458166554880.0, + "grad_norm": 1.8615498439985667, + "language_loss": 0.67311943, + "learning_rate": 9.753671426831592e-07, + "loss": 0.69479203, + "num_input_tokens_seen": 121824470, + "step": 5664, + "time_per_iteration": 2.43747878074646 + }, + { + "auxiliary_loss_clip": 0.01148724, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.04486561, + "balance_loss_mlp": 1.01945019, + "epoch": 0.6811759754704503, + "flos": 22156165330560.0, + "grad_norm": 1.7824829079284061, + "language_loss": 0.79699397, + "learning_rate": 9.746982420903483e-07, + "loss": 0.81874871, + "num_input_tokens_seen": 121842665, + "step": 5665, + "time_per_iteration": 2.4514105319976807 + }, + { + "auxiliary_loss_clip": 0.0115555, + "auxiliary_loss_mlp": 0.0102469, + "balance_loss_clip": 1.05116224, + "balance_loss_mlp": 1.0178926, + "epoch": 0.6812962183610894, + "flos": 17525377065600.0, + "grad_norm": 1.9972980751986613, + "language_loss": 0.74940825, + "learning_rate": 9.740294970349993e-07, + "loss": 0.77121067, + "num_input_tokens_seen": 121859080, + "step": 5666, + "time_per_iteration": 2.420055627822876 + }, + { + "auxiliary_loss_clip": 0.01049221, + "auxiliary_loss_mlp": 0.01001837, + "balance_loss_clip": 1.01485109, + "balance_loss_mlp": 1.00085402, + "epoch": 0.6814164612517285, + "flos": 60274480855680.0, + "grad_norm": 0.8743419920956141, + "language_loss": 0.60887706, + "learning_rate": 9.733609076185594e-07, + "loss": 0.62938762, + "num_input_tokens_seen": 121915485, + "step": 5667, + "time_per_iteration": 2.9459009170532227 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01030116, + "balance_loss_clip": 1.04893303, + "balance_loss_mlp": 1.02233171, + "epoch": 0.6815367041423676, + "flos": 19317750750720.0, + "grad_norm": 1.7976735443505019, + "language_loss": 0.83713353, + "learning_rate": 9.72692473942455e-07, + "loss": 0.85898274, + "num_input_tokens_seen": 121932710, + "step": 5668, + "time_per_iteration": 2.4389798641204834 + }, + { + "auxiliary_loss_clip": 0.01120624, + "auxiliary_loss_mlp": 0.01024956, + "balance_loss_clip": 1.04871285, + "balance_loss_mlp": 1.01714134, + "epoch": 0.6816569470330067, + "flos": 22161696024960.0, + "grad_norm": 4.137210942081937, + "language_loss": 0.77749372, + "learning_rate": 9.720241961080849e-07, + "loss": 0.79894954, + "num_input_tokens_seen": 121952025, + "step": 5669, + "time_per_iteration": 3.269195556640625 + }, + { + "auxiliary_loss_clip": 0.01168316, + "auxiliary_loss_mlp": 0.01026068, + "balance_loss_clip": 1.04765201, + "balance_loss_mlp": 1.01866829, + "epoch": 0.6817771899236458, + "flos": 41463501137280.0, + "grad_norm": 2.2527367203781505, + "language_loss": 0.73023182, + "learning_rate": 9.713560742168259e-07, + "loss": 0.75217569, + "num_input_tokens_seen": 121974650, + "step": 5670, + "time_per_iteration": 2.5809009075164795 + }, + { + "auxiliary_loss_clip": 0.01125893, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.04355431, + "balance_loss_mlp": 1.02238309, + "epoch": 0.6818974328142848, + "flos": 21106138026240.0, + "grad_norm": 1.8757381891085267, + "language_loss": 0.71325529, + "learning_rate": 9.706881083700333e-07, + "loss": 0.73480815, + "num_input_tokens_seen": 121994335, + "step": 5671, + "time_per_iteration": 3.3486883640289307 + }, + { + "auxiliary_loss_clip": 0.01100251, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.04716277, + "balance_loss_mlp": 1.02052331, + "epoch": 0.682017675704924, + "flos": 20441897769600.0, + "grad_norm": 1.9867108097680781, + "language_loss": 0.82230723, + "learning_rate": 9.700202986690357e-07, + "loss": 0.84359366, + "num_input_tokens_seen": 122012635, + "step": 5672, + "time_per_iteration": 2.5564680099487305 + }, + { + "auxiliary_loss_clip": 0.0115358, + "auxiliary_loss_mlp": 0.00762533, + "balance_loss_clip": 1.04692459, + "balance_loss_mlp": 1.00061178, + "epoch": 0.682137918595563, + "flos": 20044438801920.0, + "grad_norm": 1.8920798058221098, + "language_loss": 0.66328001, + "learning_rate": 9.693526452151413e-07, + "loss": 0.68244117, + "num_input_tokens_seen": 122031685, + "step": 5673, + "time_per_iteration": 3.2407755851745605 + }, + { + "auxiliary_loss_clip": 0.01132798, + "auxiliary_loss_mlp": 0.01023499, + "balance_loss_clip": 1.04496288, + "balance_loss_mlp": 1.01537526, + "epoch": 0.6822581614862021, + "flos": 31684559063040.0, + "grad_norm": 1.697455899643641, + "language_loss": 0.75330949, + "learning_rate": 9.686851481096305e-07, + "loss": 0.77487242, + "num_input_tokens_seen": 122052995, + "step": 5674, + "time_per_iteration": 2.5931127071380615 + }, + { + "auxiliary_loss_clip": 0.01099548, + "auxiliary_loss_mlp": 0.01026729, + "balance_loss_clip": 1.04274535, + "balance_loss_mlp": 1.01905751, + "epoch": 0.6823784043768413, + "flos": 23477570864640.0, + "grad_norm": 1.8882355076551773, + "language_loss": 0.7173987, + "learning_rate": 9.68017807453762e-07, + "loss": 0.73866153, + "num_input_tokens_seen": 122071740, + "step": 5675, + "time_per_iteration": 2.5883944034576416 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.00762078, + "balance_loss_clip": 1.04831815, + "balance_loss_mlp": 1.00057149, + "epoch": 0.6824986472674803, + "flos": 14137134024960.0, + "grad_norm": 1.8058269271248202, + "language_loss": 0.72952092, + "learning_rate": 9.673506233487721e-07, + "loss": 0.74857843, + "num_input_tokens_seen": 122089705, + "step": 5676, + "time_per_iteration": 3.24983811378479 + }, + { + "auxiliary_loss_clip": 0.01141474, + "auxiliary_loss_mlp": 0.00761906, + "balance_loss_clip": 1.04461622, + "balance_loss_mlp": 1.00050855, + "epoch": 0.6826188901581194, + "flos": 21504997624320.0, + "grad_norm": 1.680776358043912, + "language_loss": 0.8608079, + "learning_rate": 9.666835958958717e-07, + "loss": 0.87984169, + "num_input_tokens_seen": 122109025, + "step": 5677, + "time_per_iteration": 2.4849982261657715 + }, + { + "auxiliary_loss_clip": 0.01167742, + "auxiliary_loss_mlp": 0.01020407, + "balance_loss_clip": 1.04891181, + "balance_loss_mlp": 1.01343036, + "epoch": 0.6827391330487584, + "flos": 20810126044800.0, + "grad_norm": 2.4835521219678673, + "language_loss": 0.80907482, + "learning_rate": 9.660167251962484e-07, + "loss": 0.83095634, + "num_input_tokens_seen": 122127385, + "step": 5678, + "time_per_iteration": 2.4099223613739014 + }, + { + "auxiliary_loss_clip": 0.01129298, + "auxiliary_loss_mlp": 0.01023543, + "balance_loss_clip": 1.04319572, + "balance_loss_mlp": 1.01692426, + "epoch": 0.6828593759393976, + "flos": 21688788539520.0, + "grad_norm": 10.157356116017443, + "language_loss": 0.77760178, + "learning_rate": 9.653500113510654e-07, + "loss": 0.7991302, + "num_input_tokens_seen": 122146500, + "step": 5679, + "time_per_iteration": 2.554255485534668 + }, + { + "auxiliary_loss_clip": 0.01134528, + "auxiliary_loss_mlp": 0.01027876, + "balance_loss_clip": 1.04295611, + "balance_loss_mlp": 1.02032137, + "epoch": 0.6829796188300367, + "flos": 25337707557120.0, + "grad_norm": 2.442300309763335, + "language_loss": 0.67099756, + "learning_rate": 9.646834544614627e-07, + "loss": 0.69262159, + "num_input_tokens_seen": 122167000, + "step": 5680, + "time_per_iteration": 2.537274122238159 + }, + { + "auxiliary_loss_clip": 0.01132529, + "auxiliary_loss_mlp": 0.01026343, + "balance_loss_clip": 1.04588199, + "balance_loss_mlp": 1.0193994, + "epoch": 0.6830998617206757, + "flos": 20704800389760.0, + "grad_norm": 1.7629091437102198, + "language_loss": 0.76264429, + "learning_rate": 9.64017054628558e-07, + "loss": 0.78423297, + "num_input_tokens_seen": 122185825, + "step": 5681, + "time_per_iteration": 2.467233419418335 + }, + { + "auxiliary_loss_clip": 0.01114737, + "auxiliary_loss_mlp": 0.01024885, + "balance_loss_clip": 1.04224157, + "balance_loss_mlp": 1.01790273, + "epoch": 0.6832201046113149, + "flos": 21726638496000.0, + "grad_norm": 1.7542470196537288, + "language_loss": 0.78997815, + "learning_rate": 9.63350811953441e-07, + "loss": 0.81137443, + "num_input_tokens_seen": 122206200, + "step": 5682, + "time_per_iteration": 2.5623183250427246 + }, + { + "auxiliary_loss_clip": 0.01127531, + "auxiliary_loss_mlp": 0.01022149, + "balance_loss_clip": 1.04386592, + "balance_loss_mlp": 1.01493406, + "epoch": 0.6833403475019539, + "flos": 19536554448000.0, + "grad_norm": 2.8896546595659363, + "language_loss": 0.7039206, + "learning_rate": 9.626847265371826e-07, + "loss": 0.72541738, + "num_input_tokens_seen": 122225520, + "step": 5683, + "time_per_iteration": 2.512218475341797 + }, + { + "auxiliary_loss_clip": 0.01131057, + "auxiliary_loss_mlp": 0.01025975, + "balance_loss_clip": 1.04320049, + "balance_loss_mlp": 1.01885772, + "epoch": 0.683460590392593, + "flos": 19352153001600.0, + "grad_norm": 2.0864397872367526, + "language_loss": 0.78533518, + "learning_rate": 9.620187984808262e-07, + "loss": 0.80690545, + "num_input_tokens_seen": 122244320, + "step": 5684, + "time_per_iteration": 2.486323833465576 + }, + { + "auxiliary_loss_clip": 0.01139136, + "auxiliary_loss_mlp": 0.00761744, + "balance_loss_clip": 1.04664588, + "balance_loss_mlp": 1.0005579, + "epoch": 0.6835808332832322, + "flos": 23288500650240.0, + "grad_norm": 1.610999900544913, + "language_loss": 0.85963929, + "learning_rate": 9.613530278853919e-07, + "loss": 0.8786481, + "num_input_tokens_seen": 122264295, + "step": 5685, + "time_per_iteration": 2.494279384613037 + }, + { + "auxiliary_loss_clip": 0.01155453, + "auxiliary_loss_mlp": 0.01023088, + "balance_loss_clip": 1.04990995, + "balance_loss_mlp": 1.01598287, + "epoch": 0.6837010761738712, + "flos": 21653416621440.0, + "grad_norm": 2.021045836357281, + "language_loss": 0.74426723, + "learning_rate": 9.60687414851879e-07, + "loss": 0.76605266, + "num_input_tokens_seen": 122285300, + "step": 5686, + "time_per_iteration": 2.4731569290161133 + }, + { + "auxiliary_loss_clip": 0.01143967, + "auxiliary_loss_mlp": 0.01025345, + "balance_loss_clip": 1.0489378, + "balance_loss_mlp": 1.01801658, + "epoch": 0.6838213190645103, + "flos": 17566387418880.0, + "grad_norm": 2.3009906422321733, + "language_loss": 0.77402008, + "learning_rate": 9.600219594812575e-07, + "loss": 0.79571319, + "num_input_tokens_seen": 122303240, + "step": 5687, + "time_per_iteration": 2.448185920715332 + }, + { + "auxiliary_loss_clip": 0.01167513, + "auxiliary_loss_mlp": 0.01024154, + "balance_loss_clip": 1.04848981, + "balance_loss_mlp": 1.01726413, + "epoch": 0.6839415619551494, + "flos": 23112538899840.0, + "grad_norm": 1.692974361196155, + "language_loss": 0.72805774, + "learning_rate": 9.593566618744786e-07, + "loss": 0.74997437, + "num_input_tokens_seen": 122323390, + "step": 5688, + "time_per_iteration": 2.4155213832855225 + }, + { + "auxiliary_loss_clip": 0.01167908, + "auxiliary_loss_mlp": 0.01026168, + "balance_loss_clip": 1.04748166, + "balance_loss_mlp": 1.01888442, + "epoch": 0.6840618048457885, + "flos": 22127868391680.0, + "grad_norm": 1.810355264466806, + "language_loss": 0.73995751, + "learning_rate": 9.58691522132466e-07, + "loss": 0.76189828, + "num_input_tokens_seen": 122342200, + "step": 5689, + "time_per_iteration": 2.4185643196105957 + }, + { + "auxiliary_loss_clip": 0.01146562, + "auxiliary_loss_mlp": 0.01025276, + "balance_loss_clip": 1.04885101, + "balance_loss_mlp": 1.01764023, + "epoch": 0.6841820477364275, + "flos": 22015898720640.0, + "grad_norm": 2.397166713152011, + "language_loss": 0.84819698, + "learning_rate": 9.58026540356123e-07, + "loss": 0.86991537, + "num_input_tokens_seen": 122360465, + "step": 5690, + "time_per_iteration": 2.4903764724731445 + }, + { + "auxiliary_loss_clip": 0.01155678, + "auxiliary_loss_mlp": 0.01025346, + "balance_loss_clip": 1.04545617, + "balance_loss_mlp": 1.01803267, + "epoch": 0.6843022906270667, + "flos": 24900531125760.0, + "grad_norm": 1.6046056119883991, + "language_loss": 0.86712223, + "learning_rate": 9.573617166463246e-07, + "loss": 0.88893247, + "num_input_tokens_seen": 122381680, + "step": 5691, + "time_per_iteration": 2.4692256450653076 + }, + { + "auxiliary_loss_clip": 0.01142201, + "auxiliary_loss_mlp": 0.01025309, + "balance_loss_clip": 1.04382288, + "balance_loss_mlp": 1.0188067, + "epoch": 0.6844225335177058, + "flos": 19969924037760.0, + "grad_norm": 3.558523135121771, + "language_loss": 0.60136461, + "learning_rate": 9.56697051103924e-07, + "loss": 0.62303966, + "num_input_tokens_seen": 122399120, + "step": 5692, + "time_per_iteration": 2.4762392044067383 + }, + { + "auxiliary_loss_clip": 0.01137827, + "auxiliary_loss_mlp": 0.01026207, + "balance_loss_clip": 1.04455853, + "balance_loss_mlp": 1.01884627, + "epoch": 0.6845427764083448, + "flos": 25883334126720.0, + "grad_norm": 2.2050805970703644, + "language_loss": 0.81205273, + "learning_rate": 9.560325438297522e-07, + "loss": 0.83369303, + "num_input_tokens_seen": 122417430, + "step": 5693, + "time_per_iteration": 2.4971835613250732 + }, + { + "auxiliary_loss_clip": 0.01144783, + "auxiliary_loss_mlp": 0.01023142, + "balance_loss_clip": 1.05141211, + "balance_loss_mlp": 1.01629364, + "epoch": 0.684663019298984, + "flos": 18880143356160.0, + "grad_norm": 1.7168429276221087, + "language_loss": 0.86630297, + "learning_rate": 9.553681949246127e-07, + "loss": 0.88798225, + "num_input_tokens_seen": 122435055, + "step": 5694, + "time_per_iteration": 2.5113637447357178 + }, + { + "auxiliary_loss_clip": 0.01130482, + "auxiliary_loss_mlp": 0.01026273, + "balance_loss_clip": 1.0452404, + "balance_loss_mlp": 1.01836371, + "epoch": 0.684783262189623, + "flos": 54193725302400.0, + "grad_norm": 1.962164242458883, + "language_loss": 0.74914157, + "learning_rate": 9.547040044892886e-07, + "loss": 0.77070916, + "num_input_tokens_seen": 122462570, + "step": 5695, + "time_per_iteration": 3.528773069381714 + }, + { + "auxiliary_loss_clip": 0.01062137, + "auxiliary_loss_mlp": 0.01000976, + "balance_loss_clip": 1.01544106, + "balance_loss_mlp": 0.9999153, + "epoch": 0.6849035050802621, + "flos": 63970264143360.0, + "grad_norm": 0.8630307867586, + "language_loss": 0.60133076, + "learning_rate": 9.540399726245354e-07, + "loss": 0.62196183, + "num_input_tokens_seen": 122519275, + "step": 5696, + "time_per_iteration": 2.92171573638916 + }, + { + "auxiliary_loss_clip": 0.01136223, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.04433775, + "balance_loss_mlp": 1.01846409, + "epoch": 0.6850237479709013, + "flos": 25224121774080.0, + "grad_norm": 1.6878599159076844, + "language_loss": 0.68831736, + "learning_rate": 9.533760994310859e-07, + "loss": 0.70994282, + "num_input_tokens_seen": 122539675, + "step": 5697, + "time_per_iteration": 2.5102057456970215 + }, + { + "auxiliary_loss_clip": 0.0116935, + "auxiliary_loss_mlp": 0.01024984, + "balance_loss_clip": 1.04903245, + "balance_loss_mlp": 1.01768196, + "epoch": 0.6851439908615403, + "flos": 19354128249600.0, + "grad_norm": 2.040418355002968, + "language_loss": 0.75328302, + "learning_rate": 9.527123850096508e-07, + "loss": 0.77522635, + "num_input_tokens_seen": 122558035, + "step": 5698, + "time_per_iteration": 3.246387004852295 + }, + { + "auxiliary_loss_clip": 0.01159818, + "auxiliary_loss_mlp": 0.01022887, + "balance_loss_clip": 1.04740846, + "balance_loss_mlp": 1.01582348, + "epoch": 0.6852642337521794, + "flos": 23182133500800.0, + "grad_norm": 1.8724907840886837, + "language_loss": 0.71947205, + "learning_rate": 9.520488294609142e-07, + "loss": 0.74129915, + "num_input_tokens_seen": 122576815, + "step": 5699, + "time_per_iteration": 2.447057008743286 + }, + { + "auxiliary_loss_clip": 0.01029632, + "auxiliary_loss_mlp": 0.01002322, + "balance_loss_clip": 1.01675105, + "balance_loss_mlp": 1.00125527, + "epoch": 0.6853844766428185, + "flos": 62647206583680.0, + "grad_norm": 0.7445548108741515, + "language_loss": 0.53871405, + "learning_rate": 9.513854328855368e-07, + "loss": 0.55903363, + "num_input_tokens_seen": 122634690, + "step": 5700, + "time_per_iteration": 3.9129080772399902 + }, + { + "auxiliary_loss_clip": 0.01167198, + "auxiliary_loss_mlp": 0.01024318, + "balance_loss_clip": 1.04985428, + "balance_loss_mlp": 1.01730883, + "epoch": 0.6855047195334576, + "flos": 23437242869760.0, + "grad_norm": 2.0281115486823693, + "language_loss": 0.81323457, + "learning_rate": 9.507221953841558e-07, + "loss": 0.83514977, + "num_input_tokens_seen": 122652320, + "step": 5701, + "time_per_iteration": 2.4152815341949463 + }, + { + "auxiliary_loss_clip": 0.01158487, + "auxiliary_loss_mlp": 0.01024381, + "balance_loss_clip": 1.05084181, + "balance_loss_mlp": 1.01668656, + "epoch": 0.6856249624240967, + "flos": 20664831530880.0, + "grad_norm": 1.5102995450698231, + "language_loss": 0.78113216, + "learning_rate": 9.500591170573824e-07, + "loss": 0.80296087, + "num_input_tokens_seen": 122672340, + "step": 5702, + "time_per_iteration": 2.4542086124420166 + }, + { + "auxiliary_loss_clip": 0.0110879, + "auxiliary_loss_mlp": 0.01023902, + "balance_loss_clip": 1.04199123, + "balance_loss_mlp": 1.01666617, + "epoch": 0.6857452053147358, + "flos": 17087302794240.0, + "grad_norm": 1.9622124326126513, + "language_loss": 0.74071407, + "learning_rate": 9.493961980058078e-07, + "loss": 0.76204097, + "num_input_tokens_seen": 122689935, + "step": 5703, + "time_per_iteration": 3.2859106063842773 + }, + { + "auxiliary_loss_clip": 0.01083469, + "auxiliary_loss_mlp": 0.0102494, + "balance_loss_clip": 1.03926325, + "balance_loss_mlp": 1.01776648, + "epoch": 0.6858654482053749, + "flos": 30847266057600.0, + "grad_norm": 1.9613972706816656, + "language_loss": 0.67700058, + "learning_rate": 9.48733438329993e-07, + "loss": 0.69808471, + "num_input_tokens_seen": 122710200, + "step": 5704, + "time_per_iteration": 2.638317823410034 + }, + { + "auxiliary_loss_clip": 0.01168297, + "auxiliary_loss_mlp": 0.00762092, + "balance_loss_clip": 1.05002117, + "balance_loss_mlp": 1.0005399, + "epoch": 0.6859856910960139, + "flos": 28877314510080.0, + "grad_norm": 1.7804988203379646, + "language_loss": 0.74381697, + "learning_rate": 9.480708381304807e-07, + "loss": 0.76312089, + "num_input_tokens_seen": 122731495, + "step": 5705, + "time_per_iteration": 2.4973254203796387 + }, + { + "auxiliary_loss_clip": 0.01109901, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.04580879, + "balance_loss_mlp": 1.02088904, + "epoch": 0.6861059339866531, + "flos": 19354523299200.0, + "grad_norm": 2.3999947672265005, + "language_loss": 0.83796859, + "learning_rate": 9.474083975077858e-07, + "loss": 0.85935223, + "num_input_tokens_seen": 122748620, + "step": 5706, + "time_per_iteration": 2.5197863578796387 + }, + { + "auxiliary_loss_clip": 0.01148622, + "auxiliary_loss_mlp": 0.01022976, + "balance_loss_clip": 1.04636383, + "balance_loss_mlp": 1.01590347, + "epoch": 0.6862261768772921, + "flos": 22199976944640.0, + "grad_norm": 9.835527263764096, + "language_loss": 0.80111837, + "learning_rate": 9.467461165623994e-07, + "loss": 0.82283431, + "num_input_tokens_seen": 122767670, + "step": 5707, + "time_per_iteration": 2.442981004714966 + }, + { + "auxiliary_loss_clip": 0.01158062, + "auxiliary_loss_mlp": 0.01022686, + "balance_loss_clip": 1.04757833, + "balance_loss_mlp": 1.01580453, + "epoch": 0.6863464197679312, + "flos": 26285677344000.0, + "grad_norm": 2.144117148262155, + "language_loss": 0.79662561, + "learning_rate": 9.46083995394791e-07, + "loss": 0.81843305, + "num_input_tokens_seen": 122785480, + "step": 5708, + "time_per_iteration": 2.5060737133026123 + }, + { + "auxiliary_loss_clip": 0.01155173, + "auxiliary_loss_mlp": 0.00761378, + "balance_loss_clip": 1.04716969, + "balance_loss_mlp": 1.00051546, + "epoch": 0.6864666626585703, + "flos": 37815228564480.0, + "grad_norm": 2.481138689271586, + "language_loss": 0.63409549, + "learning_rate": 9.454220341054012e-07, + "loss": 0.65326095, + "num_input_tokens_seen": 122810265, + "step": 5709, + "time_per_iteration": 2.598345994949341 + }, + { + "auxiliary_loss_clip": 0.01123225, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.04328811, + "balance_loss_mlp": 1.02247143, + "epoch": 0.6865869055492094, + "flos": 19391152193280.0, + "grad_norm": 1.9070596138514655, + "language_loss": 0.80716562, + "learning_rate": 9.447602327946512e-07, + "loss": 0.82869542, + "num_input_tokens_seen": 122828905, + "step": 5710, + "time_per_iteration": 2.4986867904663086 + }, + { + "auxiliary_loss_clip": 0.01139256, + "auxiliary_loss_mlp": 0.01026968, + "balance_loss_clip": 1.0451231, + "balance_loss_mlp": 1.01943982, + "epoch": 0.6867071484398485, + "flos": 20375966355840.0, + "grad_norm": 1.8066430502567807, + "language_loss": 0.76586872, + "learning_rate": 9.440985915629338e-07, + "loss": 0.7875309, + "num_input_tokens_seen": 122846235, + "step": 5711, + "time_per_iteration": 2.4718642234802246 + }, + { + "auxiliary_loss_clip": 0.01168395, + "auxiliary_loss_mlp": 0.01024389, + "balance_loss_clip": 1.05066097, + "balance_loss_mlp": 1.01735544, + "epoch": 0.6868273913304875, + "flos": 15889143801600.0, + "grad_norm": 2.196203337506372, + "language_loss": 0.72919637, + "learning_rate": 9.434371105106223e-07, + "loss": 0.7511242, + "num_input_tokens_seen": 122863835, + "step": 5712, + "time_per_iteration": 2.3857431411743164 + }, + { + "auxiliary_loss_clip": 0.0112347, + "auxiliary_loss_mlp": 0.01026019, + "balance_loss_clip": 1.04348886, + "balance_loss_mlp": 1.01850295, + "epoch": 0.6869476342211267, + "flos": 24462492768000.0, + "grad_norm": 1.7184899821878972, + "language_loss": 0.70556158, + "learning_rate": 9.427757897380602e-07, + "loss": 0.72705644, + "num_input_tokens_seen": 122883235, + "step": 5713, + "time_per_iteration": 2.5473453998565674 + }, + { + "auxiliary_loss_clip": 0.0112479, + "auxiliary_loss_mlp": 0.01022913, + "balance_loss_clip": 1.04569185, + "balance_loss_mlp": 1.01515865, + "epoch": 0.6870678771117658, + "flos": 18442571875200.0, + "grad_norm": 2.286378730540375, + "language_loss": 0.84600478, + "learning_rate": 9.421146293455695e-07, + "loss": 0.86748171, + "num_input_tokens_seen": 122898975, + "step": 5714, + "time_per_iteration": 2.5033187866210938 + }, + { + "auxiliary_loss_clip": 0.0113827, + "auxiliary_loss_mlp": 0.0102412, + "balance_loss_clip": 1.04431129, + "balance_loss_mlp": 1.01693487, + "epoch": 0.6871881200024048, + "flos": 22200371994240.0, + "grad_norm": 1.788128408753099, + "language_loss": 0.68465519, + "learning_rate": 9.414536294334489e-07, + "loss": 0.70627904, + "num_input_tokens_seen": 122918995, + "step": 5715, + "time_per_iteration": 2.5033304691314697 + }, + { + "auxiliary_loss_clip": 0.01141822, + "auxiliary_loss_mlp": 0.01025809, + "balance_loss_clip": 1.04271805, + "balance_loss_mlp": 1.01810789, + "epoch": 0.687308362893044, + "flos": 22127724737280.0, + "grad_norm": 2.006388834957188, + "language_loss": 0.6968714, + "learning_rate": 9.407927901019708e-07, + "loss": 0.7185477, + "num_input_tokens_seen": 122938125, + "step": 5716, + "time_per_iteration": 2.4821105003356934 + }, + { + "auxiliary_loss_clip": 0.01154673, + "auxiliary_loss_mlp": 0.01023301, + "balance_loss_clip": 1.047176, + "balance_loss_mlp": 1.01627409, + "epoch": 0.687428605783683, + "flos": 25040546340480.0, + "grad_norm": 2.7207932885444213, + "language_loss": 0.76633137, + "learning_rate": 9.401321114513854e-07, + "loss": 0.78811109, + "num_input_tokens_seen": 122957020, + "step": 5717, + "time_per_iteration": 2.4805705547332764 + }, + { + "auxiliary_loss_clip": 0.01170348, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.04962957, + "balance_loss_mlp": 1.0198853, + "epoch": 0.6875488486743221, + "flos": 23770063313280.0, + "grad_norm": 1.8699197166872108, + "language_loss": 0.74892223, + "learning_rate": 9.394715935819155e-07, + "loss": 0.7708987, + "num_input_tokens_seen": 122977410, + "step": 5718, + "time_per_iteration": 2.457313060760498 + }, + { + "auxiliary_loss_clip": 0.01158266, + "auxiliary_loss_mlp": 0.01027903, + "balance_loss_clip": 1.04766273, + "balance_loss_mlp": 1.0207293, + "epoch": 0.6876690915649613, + "flos": 25516937445120.0, + "grad_norm": 3.5285869913030243, + "language_loss": 0.61999249, + "learning_rate": 9.388112365937608e-07, + "loss": 0.64185417, + "num_input_tokens_seen": 122996875, + "step": 5719, + "time_per_iteration": 2.4832680225372314 + }, + { + "auxiliary_loss_clip": 0.01127298, + "auxiliary_loss_mlp": 0.01022468, + "balance_loss_clip": 1.04493904, + "balance_loss_mlp": 1.014925, + "epoch": 0.6877893344556003, + "flos": 19427996568960.0, + "grad_norm": 2.416596294166508, + "language_loss": 0.82506192, + "learning_rate": 9.381510405870985e-07, + "loss": 0.84655952, + "num_input_tokens_seen": 123015890, + "step": 5720, + "time_per_iteration": 2.497011661529541 + }, + { + "auxiliary_loss_clip": 0.01154961, + "auxiliary_loss_mlp": 0.01024888, + "balance_loss_clip": 1.0477922, + "balance_loss_mlp": 1.01751471, + "epoch": 0.6879095773462394, + "flos": 18661303745280.0, + "grad_norm": 2.97680934093026, + "language_loss": 0.77138931, + "learning_rate": 9.374910056620791e-07, + "loss": 0.79318774, + "num_input_tokens_seen": 123034955, + "step": 5721, + "time_per_iteration": 2.4259798526763916 + }, + { + "auxiliary_loss_clip": 0.01156637, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.04807711, + "balance_loss_mlp": 1.02099884, + "epoch": 0.6880298202368785, + "flos": 20883132437760.0, + "grad_norm": 2.298995768917676, + "language_loss": 0.80937731, + "learning_rate": 9.368311319188293e-07, + "loss": 0.83122969, + "num_input_tokens_seen": 123052770, + "step": 5722, + "time_per_iteration": 3.2340428829193115 + }, + { + "auxiliary_loss_clip": 0.01126149, + "auxiliary_loss_mlp": 0.01023562, + "balance_loss_clip": 1.0438261, + "balance_loss_mlp": 1.01648116, + "epoch": 0.6881500631275176, + "flos": 30153292318080.0, + "grad_norm": 1.6718056376974701, + "language_loss": 0.7943576, + "learning_rate": 9.361714194574515e-07, + "loss": 0.81585473, + "num_input_tokens_seen": 123075105, + "step": 5723, + "time_per_iteration": 2.5868897438049316 + }, + { + "auxiliary_loss_clip": 0.01071363, + "auxiliary_loss_mlp": 0.01001711, + "balance_loss_clip": 1.01588881, + "balance_loss_mlp": 1.00064361, + "epoch": 0.6882703060181566, + "flos": 66181537215360.0, + "grad_norm": 0.730133418288036, + "language_loss": 0.58289182, + "learning_rate": 9.355118683780228e-07, + "loss": 0.60362256, + "num_input_tokens_seen": 123145175, + "step": 5724, + "time_per_iteration": 3.102224349975586 + }, + { + "auxiliary_loss_clip": 0.01167932, + "auxiliary_loss_mlp": 0.01025853, + "balance_loss_clip": 1.04799485, + "balance_loss_mlp": 1.01851535, + "epoch": 0.6883905489087958, + "flos": 18214646123520.0, + "grad_norm": 2.096248042270591, + "language_loss": 0.79752886, + "learning_rate": 9.348524787805987e-07, + "loss": 0.81946671, + "num_input_tokens_seen": 123160365, + "step": 5725, + "time_per_iteration": 3.5191938877105713 + }, + { + "auxiliary_loss_clip": 0.01129102, + "auxiliary_loss_mlp": 0.010215, + "balance_loss_clip": 1.04172623, + "balance_loss_mlp": 1.01447535, + "epoch": 0.6885107917994349, + "flos": 14056262553600.0, + "grad_norm": 4.922032093977957, + "language_loss": 0.8538748, + "learning_rate": 9.341932507652053e-07, + "loss": 0.87538081, + "num_input_tokens_seen": 123174855, + "step": 5726, + "time_per_iteration": 2.519352436065674 + }, + { + "auxiliary_loss_clip": 0.01138951, + "auxiliary_loss_mlp": 0.01027875, + "balance_loss_clip": 1.04224718, + "balance_loss_mlp": 1.01955998, + "epoch": 0.6886310346900739, + "flos": 28690722334080.0, + "grad_norm": 1.905341977876921, + "language_loss": 0.78584218, + "learning_rate": 9.335341844318489e-07, + "loss": 0.80751044, + "num_input_tokens_seen": 123194995, + "step": 5727, + "time_per_iteration": 3.290689468383789 + }, + { + "auxiliary_loss_clip": 0.01140837, + "auxiliary_loss_mlp": 0.01027219, + "balance_loss_clip": 1.04652655, + "balance_loss_mlp": 1.01994681, + "epoch": 0.6887512775807131, + "flos": 24535319592960.0, + "grad_norm": 1.8871737465022373, + "language_loss": 0.73136538, + "learning_rate": 9.328752798805091e-07, + "loss": 0.75304592, + "num_input_tokens_seen": 123213465, + "step": 5728, + "time_per_iteration": 2.5016701221466064 + }, + { + "auxiliary_loss_clip": 0.01154397, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.04627395, + "balance_loss_mlp": 1.0199703, + "epoch": 0.6888715204713521, + "flos": 22414363269120.0, + "grad_norm": 2.2402181236994965, + "language_loss": 0.76077914, + "learning_rate": 9.322165372111399e-07, + "loss": 0.78259498, + "num_input_tokens_seen": 123231610, + "step": 5729, + "time_per_iteration": 3.2580041885375977 + }, + { + "auxiliary_loss_clip": 0.01123636, + "auxiliary_loss_mlp": 0.0102226, + "balance_loss_clip": 1.04590964, + "balance_loss_mlp": 1.01513469, + "epoch": 0.6889917633619912, + "flos": 22054323294720.0, + "grad_norm": 1.9574166982418773, + "language_loss": 0.75711697, + "learning_rate": 9.315579565236747e-07, + "loss": 0.7785759, + "num_input_tokens_seen": 123250715, + "step": 5730, + "time_per_iteration": 2.5145270824432373 + }, + { + "auxiliary_loss_clip": 0.01140609, + "auxiliary_loss_mlp": 0.01028075, + "balance_loss_clip": 1.0505271, + "balance_loss_mlp": 1.02004063, + "epoch": 0.6891120062526304, + "flos": 23949724164480.0, + "grad_norm": 1.6966003496311692, + "language_loss": 0.74528551, + "learning_rate": 9.308995379180162e-07, + "loss": 0.7669723, + "num_input_tokens_seen": 123270270, + "step": 5731, + "time_per_iteration": 2.5157313346862793 + }, + { + "auxiliary_loss_clip": 0.01061384, + "auxiliary_loss_mlp": 0.0100106, + "balance_loss_clip": 1.01502061, + "balance_loss_mlp": 1.00006509, + "epoch": 0.6892322491432694, + "flos": 64117354337280.0, + "grad_norm": 0.7454223445601498, + "language_loss": 0.59533644, + "learning_rate": 9.302412814940488e-07, + "loss": 0.61596096, + "num_input_tokens_seen": 123333045, + "step": 5732, + "time_per_iteration": 3.0954742431640625 + }, + { + "auxiliary_loss_clip": 0.01138511, + "auxiliary_loss_mlp": 0.01024738, + "balance_loss_clip": 1.04325151, + "balance_loss_mlp": 1.01687288, + "epoch": 0.6893524920339085, + "flos": 23002436736000.0, + "grad_norm": 2.053368614831796, + "language_loss": 0.71156657, + "learning_rate": 9.295831873516276e-07, + "loss": 0.73319906, + "num_input_tokens_seen": 123352320, + "step": 5733, + "time_per_iteration": 2.5057895183563232 + }, + { + "auxiliary_loss_clip": 0.01167235, + "auxiliary_loss_mlp": 0.01025925, + "balance_loss_clip": 1.0490464, + "balance_loss_mlp": 1.01843834, + "epoch": 0.6894727349245476, + "flos": 21396260177280.0, + "grad_norm": 1.7618763236820842, + "language_loss": 0.7615453, + "learning_rate": 9.289252555905873e-07, + "loss": 0.78347695, + "num_input_tokens_seen": 123372400, + "step": 5734, + "time_per_iteration": 2.433708667755127 + }, + { + "auxiliary_loss_clip": 0.01155747, + "auxiliary_loss_mlp": 0.0102275, + "balance_loss_clip": 1.04883313, + "balance_loss_mlp": 1.01529956, + "epoch": 0.6895929778151867, + "flos": 19865316654720.0, + "grad_norm": 2.044291818992456, + "language_loss": 0.76015317, + "learning_rate": 9.282674863107334e-07, + "loss": 0.7819382, + "num_input_tokens_seen": 123390215, + "step": 5735, + "time_per_iteration": 2.4485487937927246 + }, + { + "auxiliary_loss_clip": 0.01151006, + "auxiliary_loss_mlp": 0.01027335, + "balance_loss_clip": 1.04805565, + "balance_loss_mlp": 1.02006578, + "epoch": 0.6897132207058257, + "flos": 18179166464640.0, + "grad_norm": 2.3627322131421318, + "language_loss": 0.7575624, + "learning_rate": 9.276098796118488e-07, + "loss": 0.77934575, + "num_input_tokens_seen": 123406700, + "step": 5736, + "time_per_iteration": 2.4083616733551025 + }, + { + "auxiliary_loss_clip": 0.01142096, + "auxiliary_loss_mlp": 0.01024075, + "balance_loss_clip": 1.04860759, + "balance_loss_mlp": 1.01702714, + "epoch": 0.6898334635964649, + "flos": 32561641359360.0, + "grad_norm": 1.7887429098307388, + "language_loss": 0.66648114, + "learning_rate": 9.269524355936938e-07, + "loss": 0.6881429, + "num_input_tokens_seen": 123429880, + "step": 5737, + "time_per_iteration": 2.567242383956909 + }, + { + "auxiliary_loss_clip": 0.01135014, + "auxiliary_loss_mlp": 0.01022752, + "balance_loss_clip": 1.04405141, + "balance_loss_mlp": 1.01578689, + "epoch": 0.689953706487104, + "flos": 22819004956800.0, + "grad_norm": 1.7042803989756066, + "language_loss": 0.84831685, + "learning_rate": 9.262951543560002e-07, + "loss": 0.8698945, + "num_input_tokens_seen": 123449105, + "step": 5738, + "time_per_iteration": 2.5031094551086426 + }, + { + "auxiliary_loss_clip": 0.01142445, + "auxiliary_loss_mlp": 0.01031822, + "balance_loss_clip": 1.05028462, + "balance_loss_mlp": 1.02417505, + "epoch": 0.690073949377743, + "flos": 18515362786560.0, + "grad_norm": 2.1839234862831414, + "language_loss": 0.86339325, + "learning_rate": 9.256380359984795e-07, + "loss": 0.88513589, + "num_input_tokens_seen": 123466215, + "step": 5739, + "time_per_iteration": 2.4452052116394043 + }, + { + "auxiliary_loss_clip": 0.01116626, + "auxiliary_loss_mlp": 0.01026186, + "balance_loss_clip": 1.03886485, + "balance_loss_mlp": 1.01899123, + "epoch": 0.6901941922683821, + "flos": 34857194716800.0, + "grad_norm": 2.154249940638445, + "language_loss": 0.75074643, + "learning_rate": 9.249810806208139e-07, + "loss": 0.77217454, + "num_input_tokens_seen": 123485480, + "step": 5740, + "time_per_iteration": 2.664944648742676 + }, + { + "auxiliary_loss_clip": 0.01109315, + "auxiliary_loss_mlp": 0.00761566, + "balance_loss_clip": 1.03845358, + "balance_loss_mlp": 1.00059927, + "epoch": 0.6903144351590212, + "flos": 16253672976000.0, + "grad_norm": 1.8945977996530499, + "language_loss": 0.80364668, + "learning_rate": 9.243242883226627e-07, + "loss": 0.82235551, + "num_input_tokens_seen": 123504575, + "step": 5741, + "time_per_iteration": 2.528092384338379 + }, + { + "auxiliary_loss_clip": 0.01156939, + "auxiliary_loss_mlp": 0.01028227, + "balance_loss_clip": 1.04451704, + "balance_loss_mlp": 1.02010906, + "epoch": 0.6904346780496603, + "flos": 28035137255040.0, + "grad_norm": 2.8932824773119874, + "language_loss": 0.69639575, + "learning_rate": 9.236676592036628e-07, + "loss": 0.71824741, + "num_input_tokens_seen": 123524250, + "step": 5742, + "time_per_iteration": 2.5079591274261475 + }, + { + "auxiliary_loss_clip": 0.01138907, + "auxiliary_loss_mlp": 0.01023325, + "balance_loss_clip": 1.04899776, + "balance_loss_mlp": 1.01615787, + "epoch": 0.6905549209402994, + "flos": 23624266008960.0, + "grad_norm": 1.7007250525889752, + "language_loss": 0.73777354, + "learning_rate": 9.230111933634228e-07, + "loss": 0.75939584, + "num_input_tokens_seen": 123545845, + "step": 5743, + "time_per_iteration": 2.5137429237365723 + }, + { + "auxiliary_loss_clip": 0.01158403, + "auxiliary_loss_mlp": 0.01022376, + "balance_loss_clip": 1.05024183, + "balance_loss_mlp": 1.0153966, + "epoch": 0.6906751638309385, + "flos": 23114945111040.0, + "grad_norm": 1.6831624748905776, + "language_loss": 0.80760562, + "learning_rate": 9.223548909015288e-07, + "loss": 0.82941341, + "num_input_tokens_seen": 123567535, + "step": 5744, + "time_per_iteration": 2.487330436706543 + }, + { + "auxiliary_loss_clip": 0.01103924, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.03928256, + "balance_loss_mlp": 1.01975083, + "epoch": 0.6907954067215776, + "flos": 27305468375040.0, + "grad_norm": 1.818234544003298, + "language_loss": 0.72195995, + "learning_rate": 9.216987519175407e-07, + "loss": 0.74326849, + "num_input_tokens_seen": 123587710, + "step": 5745, + "time_per_iteration": 2.6132993698120117 + }, + { + "auxiliary_loss_clip": 0.01147898, + "auxiliary_loss_mlp": 0.0102332, + "balance_loss_clip": 1.04651523, + "balance_loss_mlp": 1.01620674, + "epoch": 0.6909156496122166, + "flos": 21689399070720.0, + "grad_norm": 1.7788387209152363, + "language_loss": 0.6858052, + "learning_rate": 9.210427765109942e-07, + "loss": 0.70751739, + "num_input_tokens_seen": 123607385, + "step": 5746, + "time_per_iteration": 2.438671112060547 + }, + { + "auxiliary_loss_clip": 0.0113983, + "auxiliary_loss_mlp": 0.01025206, + "balance_loss_clip": 1.04378462, + "balance_loss_mlp": 1.0173533, + "epoch": 0.6910358925028558, + "flos": 22561453463040.0, + "grad_norm": 2.953691675612746, + "language_loss": 0.81158715, + "learning_rate": 9.20386964781402e-07, + "loss": 0.83323753, + "num_input_tokens_seen": 123625405, + "step": 5747, + "time_per_iteration": 2.4744808673858643 + }, + { + "auxiliary_loss_clip": 0.01136451, + "auxiliary_loss_mlp": 0.01023128, + "balance_loss_clip": 1.04490495, + "balance_loss_mlp": 1.01598191, + "epoch": 0.6911561353934949, + "flos": 22054107813120.0, + "grad_norm": 1.849350440588227, + "language_loss": 0.84274703, + "learning_rate": 9.197313168282472e-07, + "loss": 0.86434281, + "num_input_tokens_seen": 123642850, + "step": 5748, + "time_per_iteration": 2.4859039783477783 + }, + { + "auxiliary_loss_clip": 0.01148937, + "auxiliary_loss_mlp": 0.01024622, + "balance_loss_clip": 1.04379201, + "balance_loss_mlp": 1.01743412, + "epoch": 0.6912763782841339, + "flos": 24206557386240.0, + "grad_norm": 2.2064091290287178, + "language_loss": 0.7202186, + "learning_rate": 9.190758327509935e-07, + "loss": 0.74195421, + "num_input_tokens_seen": 123661595, + "step": 5749, + "time_per_iteration": 3.27254581451416 + }, + { + "auxiliary_loss_clip": 0.01032514, + "auxiliary_loss_mlp": 0.00752863, + "balance_loss_clip": 1.01442003, + "balance_loss_mlp": 0.99984324, + "epoch": 0.6913966211747731, + "flos": 52329641091840.0, + "grad_norm": 0.932012425441419, + "language_loss": 0.64492482, + "learning_rate": 9.184205126490767e-07, + "loss": 0.66277862, + "num_input_tokens_seen": 123710490, + "step": 5750, + "time_per_iteration": 2.953484296798706 + }, + { + "auxiliary_loss_clip": 0.01038958, + "auxiliary_loss_mlp": 0.00752811, + "balance_loss_clip": 1.0143528, + "balance_loss_mlp": 0.99991363, + "epoch": 0.6915168640654121, + "flos": 66741274851840.0, + "grad_norm": 1.0796555288360798, + "language_loss": 0.59702909, + "learning_rate": 9.177653566219075e-07, + "loss": 0.61494684, + "num_input_tokens_seen": 123765215, + "step": 5751, + "time_per_iteration": 3.8127617835998535 + }, + { + "auxiliary_loss_clip": 0.01129051, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.04291868, + "balance_loss_mlp": 1.02030134, + "epoch": 0.6916371069560512, + "flos": 18296523175680.0, + "grad_norm": 2.7052514069274753, + "language_loss": 0.762353, + "learning_rate": 9.171103647688744e-07, + "loss": 0.78391892, + "num_input_tokens_seen": 123783955, + "step": 5752, + "time_per_iteration": 2.5005910396575928 + }, + { + "auxiliary_loss_clip": 0.01075087, + "auxiliary_loss_mlp": 0.01024856, + "balance_loss_clip": 1.03978825, + "balance_loss_mlp": 1.01806748, + "epoch": 0.6917573498466904, + "flos": 19645794685440.0, + "grad_norm": 1.930178206733818, + "language_loss": 0.68999827, + "learning_rate": 9.164555371893367e-07, + "loss": 0.7109977, + "num_input_tokens_seen": 123803885, + "step": 5753, + "time_per_iteration": 2.6045751571655273 + }, + { + "auxiliary_loss_clip": 0.01155808, + "auxiliary_loss_mlp": 0.00761603, + "balance_loss_clip": 1.04900646, + "balance_loss_mlp": 1.00051844, + "epoch": 0.6918775927373294, + "flos": 14210319985920.0, + "grad_norm": 2.094529852681535, + "language_loss": 0.7520963, + "learning_rate": 9.158008739826333e-07, + "loss": 0.77127039, + "num_input_tokens_seen": 123821485, + "step": 5754, + "time_per_iteration": 3.257155418395996 + }, + { + "auxiliary_loss_clip": 0.01138581, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.04726052, + "balance_loss_mlp": 1.01980901, + "epoch": 0.6919978356279685, + "flos": 23985455218560.0, + "grad_norm": 1.5857175985167324, + "language_loss": 0.86595976, + "learning_rate": 9.151463752480744e-07, + "loss": 0.8876183, + "num_input_tokens_seen": 123840215, + "step": 5755, + "time_per_iteration": 2.501033306121826 + }, + { + "auxiliary_loss_clip": 0.01115205, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.04217076, + "balance_loss_mlp": 1.01991141, + "epoch": 0.6921180785186076, + "flos": 23622937205760.0, + "grad_norm": 1.6344181757249778, + "language_loss": 0.80368906, + "learning_rate": 9.144920410849493e-07, + "loss": 0.82511234, + "num_input_tokens_seen": 123861450, + "step": 5756, + "time_per_iteration": 3.2965164184570312 + }, + { + "auxiliary_loss_clip": 0.01144526, + "auxiliary_loss_mlp": 0.01026503, + "balance_loss_clip": 1.04586649, + "balance_loss_mlp": 1.01920414, + "epoch": 0.6922383214092467, + "flos": 21142623265920.0, + "grad_norm": 1.6706082176997474, + "language_loss": 0.80647415, + "learning_rate": 9.138378715925176e-07, + "loss": 0.82818443, + "num_input_tokens_seen": 123880545, + "step": 5757, + "time_per_iteration": 2.5535547733306885 + }, + { + "auxiliary_loss_clip": 0.01134742, + "auxiliary_loss_mlp": 0.010227, + "balance_loss_clip": 1.04456425, + "balance_loss_mlp": 1.01561022, + "epoch": 0.6923585642998857, + "flos": 21470667200640.0, + "grad_norm": 1.5756804331649827, + "language_loss": 0.80963147, + "learning_rate": 9.131838668700167e-07, + "loss": 0.8312059, + "num_input_tokens_seen": 123900615, + "step": 5758, + "time_per_iteration": 2.4891936779022217 + }, + { + "auxiliary_loss_clip": 0.01126833, + "auxiliary_loss_mlp": 0.01024972, + "balance_loss_clip": 1.04350471, + "balance_loss_mlp": 1.01792026, + "epoch": 0.6924788071905249, + "flos": 21105204272640.0, + "grad_norm": 1.9790880702226041, + "language_loss": 0.86584604, + "learning_rate": 9.125300270166598e-07, + "loss": 0.88736403, + "num_input_tokens_seen": 123921220, + "step": 5759, + "time_per_iteration": 2.544259548187256 + }, + { + "auxiliary_loss_clip": 0.01133819, + "auxiliary_loss_mlp": 0.01020283, + "balance_loss_clip": 1.04431319, + "balance_loss_mlp": 1.01279998, + "epoch": 0.692599050081164, + "flos": 26250018117120.0, + "grad_norm": 1.7142491767918437, + "language_loss": 0.85845792, + "learning_rate": 9.118763521316324e-07, + "loss": 0.87999892, + "num_input_tokens_seen": 123941795, + "step": 5760, + "time_per_iteration": 2.5486650466918945 + }, + { + "auxiliary_loss_clip": 0.01168502, + "auxiliary_loss_mlp": 0.00762082, + "balance_loss_clip": 1.04765296, + "balance_loss_mlp": 1.00051713, + "epoch": 0.692719292971803, + "flos": 20885215426560.0, + "grad_norm": 2.4385243358059463, + "language_loss": 0.76098591, + "learning_rate": 9.112228423140987e-07, + "loss": 0.7802918, + "num_input_tokens_seen": 123960715, + "step": 5761, + "time_per_iteration": 2.4318244457244873 + }, + { + "auxiliary_loss_clip": 0.011448, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.04634619, + "balance_loss_mlp": 1.02286148, + "epoch": 0.6928395358624422, + "flos": 25921938268800.0, + "grad_norm": 6.885428835020421, + "language_loss": 0.86322641, + "learning_rate": 9.105694976631932e-07, + "loss": 0.88498151, + "num_input_tokens_seen": 123978625, + "step": 5762, + "time_per_iteration": 2.5106899738311768 + }, + { + "auxiliary_loss_clip": 0.01153497, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.04892206, + "balance_loss_mlp": 1.02087808, + "epoch": 0.6929597787530812, + "flos": 23586559706880.0, + "grad_norm": 2.375343916997738, + "language_loss": 0.72661793, + "learning_rate": 9.099163182780283e-07, + "loss": 0.74843597, + "num_input_tokens_seen": 123996780, + "step": 5763, + "time_per_iteration": 2.468513011932373 + }, + { + "auxiliary_loss_clip": 0.01136672, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.04610741, + "balance_loss_mlp": 1.01919961, + "epoch": 0.6930800216437203, + "flos": 18255656476800.0, + "grad_norm": 3.429194481011056, + "language_loss": 0.49409071, + "learning_rate": 9.092633042576916e-07, + "loss": 0.51572758, + "num_input_tokens_seen": 124014045, + "step": 5764, + "time_per_iteration": 2.4508748054504395 + }, + { + "auxiliary_loss_clip": 0.0113584, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.04598045, + "balance_loss_mlp": 1.02026212, + "epoch": 0.6932002645343595, + "flos": 29168621809920.0, + "grad_norm": 1.7174633994847568, + "language_loss": 0.56394851, + "learning_rate": 9.086104557012446e-07, + "loss": 0.58558202, + "num_input_tokens_seen": 124034615, + "step": 5765, + "time_per_iteration": 2.5372023582458496 + }, + { + "auxiliary_loss_clip": 0.01145246, + "auxiliary_loss_mlp": 0.01021772, + "balance_loss_clip": 1.04603648, + "balance_loss_mlp": 1.01473331, + "epoch": 0.6933205074249985, + "flos": 23842746483840.0, + "grad_norm": 1.8192545164636547, + "language_loss": 0.65292811, + "learning_rate": 9.079577727077239e-07, + "loss": 0.67459834, + "num_input_tokens_seen": 124053445, + "step": 5766, + "time_per_iteration": 2.469566822052002 + }, + { + "auxiliary_loss_clip": 0.01156079, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.04863262, + "balance_loss_mlp": 1.02127945, + "epoch": 0.6934407503156376, + "flos": 24166696268160.0, + "grad_norm": 3.126753868699411, + "language_loss": 0.71937448, + "learning_rate": 9.073052553761404e-07, + "loss": 0.741225, + "num_input_tokens_seen": 124072810, + "step": 5767, + "time_per_iteration": 2.4635894298553467 + }, + { + "auxiliary_loss_clip": 0.01116461, + "auxiliary_loss_mlp": 0.01024929, + "balance_loss_clip": 1.04527593, + "balance_loss_mlp": 1.01679897, + "epoch": 0.6935609932062767, + "flos": 20631327120000.0, + "grad_norm": 1.6614036675546122, + "language_loss": 0.78268957, + "learning_rate": 9.066529038054805e-07, + "loss": 0.80410349, + "num_input_tokens_seen": 124092875, + "step": 5768, + "time_per_iteration": 2.5495917797088623 + }, + { + "auxiliary_loss_clip": 0.01138751, + "auxiliary_loss_mlp": 0.01022071, + "balance_loss_clip": 1.04660332, + "balance_loss_mlp": 1.01490331, + "epoch": 0.6936812360969158, + "flos": 18254184019200.0, + "grad_norm": 1.815154152159682, + "language_loss": 0.74178845, + "learning_rate": 9.060007180947071e-07, + "loss": 0.76339662, + "num_input_tokens_seen": 124110930, + "step": 5769, + "time_per_iteration": 2.4599761962890625 + }, + { + "auxiliary_loss_clip": 0.01111126, + "auxiliary_loss_mlp": 0.01027169, + "balance_loss_clip": 1.03906393, + "balance_loss_mlp": 1.01951933, + "epoch": 0.6938014789875548, + "flos": 31317336368640.0, + "grad_norm": 1.851476358020392, + "language_loss": 0.73212183, + "learning_rate": 9.053486983427534e-07, + "loss": 0.75350475, + "num_input_tokens_seen": 124132180, + "step": 5770, + "time_per_iteration": 2.6224005222320557 + }, + { + "auxiliary_loss_clip": 0.01142913, + "auxiliary_loss_mlp": 0.01026416, + "balance_loss_clip": 1.04412901, + "balance_loss_mlp": 1.01899552, + "epoch": 0.6939217218781939, + "flos": 17528429721600.0, + "grad_norm": 2.009702779252152, + "language_loss": 0.7055375, + "learning_rate": 9.046968446485326e-07, + "loss": 0.72723079, + "num_input_tokens_seen": 124150585, + "step": 5771, + "time_per_iteration": 2.468140125274658 + }, + { + "auxiliary_loss_clip": 0.01157962, + "auxiliary_loss_mlp": 0.01028684, + "balance_loss_clip": 1.04979205, + "balance_loss_mlp": 1.02054238, + "epoch": 0.6940419647688331, + "flos": 18551776199040.0, + "grad_norm": 8.092004430525884, + "language_loss": 0.70643914, + "learning_rate": 9.040451571109295e-07, + "loss": 0.72830558, + "num_input_tokens_seen": 124166205, + "step": 5772, + "time_per_iteration": 2.4313132762908936 + }, + { + "auxiliary_loss_clip": 0.01040063, + "auxiliary_loss_mlp": 0.01002626, + "balance_loss_clip": 1.02045226, + "balance_loss_mlp": 1.00154102, + "epoch": 0.6941622076594721, + "flos": 66926286829440.0, + "grad_norm": 0.832527170160163, + "language_loss": 0.6037572, + "learning_rate": 9.033936358288042e-07, + "loss": 0.62418413, + "num_input_tokens_seen": 124219940, + "step": 5773, + "time_per_iteration": 2.988834857940674 + }, + { + "auxiliary_loss_clip": 0.01170381, + "auxiliary_loss_mlp": 0.01016035, + "balance_loss_clip": 1.04958415, + "balance_loss_mlp": 1.00869155, + "epoch": 0.6942824505501112, + "flos": 26578062051840.0, + "grad_norm": 1.6914707995046343, + "language_loss": 0.82178885, + "learning_rate": 9.027422809009937e-07, + "loss": 0.84365302, + "num_input_tokens_seen": 124239885, + "step": 5774, + "time_per_iteration": 2.4825809001922607 + }, + { + "auxiliary_loss_clip": 0.01155152, + "auxiliary_loss_mlp": 0.0102025, + "balance_loss_clip": 1.04539442, + "balance_loss_mlp": 1.01265001, + "epoch": 0.6944026934407503, + "flos": 21248308056960.0, + "grad_norm": 3.8448102472157353, + "language_loss": 0.83407629, + "learning_rate": 9.020910924263054e-07, + "loss": 0.85583031, + "num_input_tokens_seen": 124258410, + "step": 5775, + "time_per_iteration": 2.4506568908691406 + }, + { + "auxiliary_loss_clip": 0.0103796, + "auxiliary_loss_mlp": 0.01003887, + "balance_loss_clip": 1.01909065, + "balance_loss_mlp": 1.0028199, + "epoch": 0.6945229363313894, + "flos": 70677191537280.0, + "grad_norm": 0.8132748977895188, + "language_loss": 0.58186227, + "learning_rate": 9.014400705035261e-07, + "loss": 0.60228074, + "num_input_tokens_seen": 124315315, + "step": 5776, + "time_per_iteration": 3.81909441947937 + }, + { + "auxiliary_loss_clip": 0.01166827, + "auxiliary_loss_mlp": 0.01022395, + "balance_loss_clip": 1.0503571, + "balance_loss_mlp": 1.01520073, + "epoch": 0.6946431792220285, + "flos": 18952934267520.0, + "grad_norm": 1.9368996722258527, + "language_loss": 0.76930261, + "learning_rate": 9.00789215231414e-07, + "loss": 0.79119486, + "num_input_tokens_seen": 124333710, + "step": 5777, + "time_per_iteration": 3.235917568206787 + }, + { + "auxiliary_loss_clip": 0.0112462, + "auxiliary_loss_mlp": 0.00762161, + "balance_loss_clip": 1.0406965, + "balance_loss_mlp": 1.00055563, + "epoch": 0.6947634221126676, + "flos": 20338834671360.0, + "grad_norm": 1.7351386482825102, + "language_loss": 0.81811237, + "learning_rate": 9.001385267087056e-07, + "loss": 0.83698022, + "num_input_tokens_seen": 124352855, + "step": 5778, + "time_per_iteration": 2.556151866912842 + }, + { + "auxiliary_loss_clip": 0.01157447, + "auxiliary_loss_mlp": 0.01022054, + "balance_loss_clip": 1.04842675, + "balance_loss_mlp": 1.01494598, + "epoch": 0.6948836650033067, + "flos": 21833723917440.0, + "grad_norm": 1.4755900027901134, + "language_loss": 0.70272839, + "learning_rate": 8.994880050341072e-07, + "loss": 0.72452343, + "num_input_tokens_seen": 124372960, + "step": 5779, + "time_per_iteration": 2.4567015171051025 + }, + { + "auxiliary_loss_clip": 0.01135167, + "auxiliary_loss_mlp": 0.01034608, + "balance_loss_clip": 1.04640913, + "balance_loss_mlp": 1.0272944, + "epoch": 0.6950039078939457, + "flos": 23657519024640.0, + "grad_norm": 2.1235507986752165, + "language_loss": 0.77724135, + "learning_rate": 8.988376503063026e-07, + "loss": 0.79893911, + "num_input_tokens_seen": 124394220, + "step": 5780, + "time_per_iteration": 3.2974607944488525 + }, + { + "auxiliary_loss_clip": 0.01122355, + "auxiliary_loss_mlp": 0.01026767, + "balance_loss_clip": 1.04586256, + "balance_loss_mlp": 1.01874423, + "epoch": 0.6951241507845849, + "flos": 21792462168960.0, + "grad_norm": 1.755732389878013, + "language_loss": 0.8114289, + "learning_rate": 8.981874626239521e-07, + "loss": 0.83292007, + "num_input_tokens_seen": 124412795, + "step": 5781, + "time_per_iteration": 2.547027349472046 + }, + { + "auxiliary_loss_clip": 0.01155894, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.04939628, + "balance_loss_mlp": 1.02349496, + "epoch": 0.695244393675224, + "flos": 14647568244480.0, + "grad_norm": 2.431478494265423, + "language_loss": 0.8810094, + "learning_rate": 8.975374420856872e-07, + "loss": 0.90288079, + "num_input_tokens_seen": 124429690, + "step": 5782, + "time_per_iteration": 2.413693428039551 + }, + { + "auxiliary_loss_clip": 0.01116878, + "auxiliary_loss_mlp": 0.01021501, + "balance_loss_clip": 1.04174519, + "balance_loss_mlp": 1.0145278, + "epoch": 0.695364636565863, + "flos": 16873203778560.0, + "grad_norm": 4.236207942755898, + "language_loss": 0.72964561, + "learning_rate": 8.968875887901157e-07, + "loss": 0.75102937, + "num_input_tokens_seen": 124447070, + "step": 5783, + "time_per_iteration": 3.23355770111084 + }, + { + "auxiliary_loss_clip": 0.011393, + "auxiliary_loss_mlp": 0.0102304, + "balance_loss_clip": 1.04344916, + "balance_loss_mlp": 1.01535654, + "epoch": 0.6954848794565022, + "flos": 19354523299200.0, + "grad_norm": 1.9328052486000094, + "language_loss": 0.63046283, + "learning_rate": 8.9623790283582e-07, + "loss": 0.6520862, + "num_input_tokens_seen": 124464950, + "step": 5784, + "time_per_iteration": 2.4698426723480225 + }, + { + "auxiliary_loss_clip": 0.0112813, + "auxiliary_loss_mlp": 0.01029524, + "balance_loss_clip": 1.04537439, + "balance_loss_mlp": 1.02193046, + "epoch": 0.6956051223471412, + "flos": 18990209606400.0, + "grad_norm": 2.340605641380878, + "language_loss": 0.76530075, + "learning_rate": 8.955883843213561e-07, + "loss": 0.78687739, + "num_input_tokens_seen": 124483965, + "step": 5785, + "time_per_iteration": 2.511528253555298 + }, + { + "auxiliary_loss_clip": 0.01160859, + "auxiliary_loss_mlp": 0.01027418, + "balance_loss_clip": 1.04810762, + "balance_loss_mlp": 1.01949668, + "epoch": 0.6957253652377803, + "flos": 16107229226880.0, + "grad_norm": 1.8928675731567448, + "language_loss": 0.86788189, + "learning_rate": 8.949390333452569e-07, + "loss": 0.88976467, + "num_input_tokens_seen": 124501910, + "step": 5786, + "time_per_iteration": 2.4222774505615234 + }, + { + "auxiliary_loss_clip": 0.01167613, + "auxiliary_loss_mlp": 0.01025162, + "balance_loss_clip": 1.04974055, + "balance_loss_mlp": 1.01804829, + "epoch": 0.6958456081284194, + "flos": 29388646569600.0, + "grad_norm": 1.6813816244161495, + "language_loss": 0.67475069, + "learning_rate": 8.942898500060279e-07, + "loss": 0.69667846, + "num_input_tokens_seen": 124521625, + "step": 5787, + "time_per_iteration": 2.4769599437713623 + }, + { + "auxiliary_loss_clip": 0.01118087, + "auxiliary_loss_mlp": 0.01023143, + "balance_loss_clip": 1.04379165, + "balance_loss_mlp": 1.01562631, + "epoch": 0.6959658510190585, + "flos": 25154850395520.0, + "grad_norm": 2.777945724328948, + "language_loss": 0.71817529, + "learning_rate": 8.936408344021493e-07, + "loss": 0.73958755, + "num_input_tokens_seen": 124538540, + "step": 5788, + "time_per_iteration": 2.5563461780548096 + }, + { + "auxiliary_loss_clip": 0.01150771, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.04954803, + "balance_loss_mlp": 1.02118313, + "epoch": 0.6960860939096976, + "flos": 42814388759040.0, + "grad_norm": 2.1689027919672967, + "language_loss": 0.71130282, + "learning_rate": 8.929919866320765e-07, + "loss": 0.73310685, + "num_input_tokens_seen": 124559355, + "step": 5789, + "time_per_iteration": 2.675229549407959 + }, + { + "auxiliary_loss_clip": 0.0113079, + "auxiliary_loss_mlp": 0.00762327, + "balance_loss_clip": 1.04247475, + "balance_loss_mlp": 1.00050497, + "epoch": 0.6962063368003367, + "flos": 17566566986880.0, + "grad_norm": 1.8505713536597757, + "language_loss": 0.81092548, + "learning_rate": 8.923433067942385e-07, + "loss": 0.82985663, + "num_input_tokens_seen": 124577920, + "step": 5790, + "time_per_iteration": 2.5029871463775635 + }, + { + "auxiliary_loss_clip": 0.01134611, + "auxiliary_loss_mlp": 0.01027165, + "balance_loss_clip": 1.04621625, + "balance_loss_mlp": 1.02006578, + "epoch": 0.6963265796909758, + "flos": 21251648021760.0, + "grad_norm": 1.854768305622176, + "language_loss": 0.68871766, + "learning_rate": 8.916947949870417e-07, + "loss": 0.71033543, + "num_input_tokens_seen": 124597585, + "step": 5791, + "time_per_iteration": 2.5244009494781494 + }, + { + "auxiliary_loss_clip": 0.0106135, + "auxiliary_loss_mlp": 0.01002651, + "balance_loss_clip": 1.01484227, + "balance_loss_mlp": 1.00163805, + "epoch": 0.6964468225816148, + "flos": 68828295801600.0, + "grad_norm": 0.750887477114594, + "language_loss": 0.58145362, + "learning_rate": 8.910464513088615e-07, + "loss": 0.60209364, + "num_input_tokens_seen": 124661625, + "step": 5792, + "time_per_iteration": 3.1116039752960205 + }, + { + "auxiliary_loss_clip": 0.01133065, + "auxiliary_loss_mlp": 0.01022869, + "balance_loss_clip": 1.0440166, + "balance_loss_mlp": 1.0150156, + "epoch": 0.696567065472254, + "flos": 18950887192320.0, + "grad_norm": 5.252957896290847, + "language_loss": 0.7840808, + "learning_rate": 8.903982758580542e-07, + "loss": 0.8056401, + "num_input_tokens_seen": 124680565, + "step": 5793, + "time_per_iteration": 2.513719320297241 + }, + { + "auxiliary_loss_clip": 0.01138294, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_clip": 1.046345, + "balance_loss_mlp": 1.02561283, + "epoch": 0.696687308362893, + "flos": 22856675345280.0, + "grad_norm": 1.9795699831082307, + "language_loss": 0.80265558, + "learning_rate": 8.897502687329457e-07, + "loss": 0.82436848, + "num_input_tokens_seen": 124700365, + "step": 5794, + "time_per_iteration": 2.4889466762542725 + }, + { + "auxiliary_loss_clip": 0.01122406, + "auxiliary_loss_mlp": 0.0102249, + "balance_loss_clip": 1.04350555, + "balance_loss_mlp": 1.0154984, + "epoch": 0.6968075512535321, + "flos": 24972926987520.0, + "grad_norm": 2.0478345205776205, + "language_loss": 0.79856312, + "learning_rate": 8.891024300318382e-07, + "loss": 0.82001209, + "num_input_tokens_seen": 124718935, + "step": 5795, + "time_per_iteration": 2.589400291442871 + }, + { + "auxiliary_loss_clip": 0.01117054, + "auxiliary_loss_mlp": 0.01023169, + "balance_loss_clip": 1.04173446, + "balance_loss_mlp": 1.01642752, + "epoch": 0.6969277941441713, + "flos": 21030438113280.0, + "grad_norm": 2.7166445141360644, + "language_loss": 0.75870109, + "learning_rate": 8.884547598530103e-07, + "loss": 0.78010333, + "num_input_tokens_seen": 124739505, + "step": 5796, + "time_per_iteration": 2.5714428424835205 + }, + { + "auxiliary_loss_clip": 0.01071477, + "auxiliary_loss_mlp": 0.01030487, + "balance_loss_clip": 1.03705394, + "balance_loss_mlp": 1.02289915, + "epoch": 0.6970480370348103, + "flos": 21579404647680.0, + "grad_norm": 1.8499792818007779, + "language_loss": 0.75178993, + "learning_rate": 8.8780725829471e-07, + "loss": 0.77280957, + "num_input_tokens_seen": 124757410, + "step": 5797, + "time_per_iteration": 2.6393935680389404 + }, + { + "auxiliary_loss_clip": 0.01168999, + "auxiliary_loss_mlp": 0.01027174, + "balance_loss_clip": 1.0486027, + "balance_loss_mlp": 1.01946163, + "epoch": 0.6971682799254494, + "flos": 22419175691520.0, + "grad_norm": 1.9340346040666845, + "language_loss": 0.78222102, + "learning_rate": 8.87159925455165e-07, + "loss": 0.80418277, + "num_input_tokens_seen": 124777240, + "step": 5798, + "time_per_iteration": 2.430356979370117 + }, + { + "auxiliary_loss_clip": 0.01122592, + "auxiliary_loss_mlp": 0.01027879, + "balance_loss_clip": 1.04484296, + "balance_loss_mlp": 1.02086091, + "epoch": 0.6972885228160886, + "flos": 20005834659840.0, + "grad_norm": 2.0074781583069417, + "language_loss": 0.73301578, + "learning_rate": 8.865127614325738e-07, + "loss": 0.75452042, + "num_input_tokens_seen": 124795670, + "step": 5799, + "time_per_iteration": 2.520542621612549 + }, + { + "auxiliary_loss_clip": 0.01132629, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_clip": 1.04342294, + "balance_loss_mlp": 1.01954317, + "epoch": 0.6974087657067276, + "flos": 37853437656960.0, + "grad_norm": 1.983450800859278, + "language_loss": 0.66537815, + "learning_rate": 8.85865766325113e-07, + "loss": 0.68697637, + "num_input_tokens_seen": 124819600, + "step": 5800, + "time_per_iteration": 2.626162528991699 + }, + { + "auxiliary_loss_clip": 0.01136049, + "auxiliary_loss_mlp": 0.01025025, + "balance_loss_clip": 1.04439139, + "balance_loss_mlp": 1.01776469, + "epoch": 0.6975290085973667, + "flos": 29489267543040.0, + "grad_norm": 2.192836506003847, + "language_loss": 0.72344482, + "learning_rate": 8.852189402309287e-07, + "loss": 0.74505556, + "num_input_tokens_seen": 124838785, + "step": 5801, + "time_per_iteration": 2.5396711826324463 + }, + { + "auxiliary_loss_clip": 0.0115447, + "auxiliary_loss_mlp": 0.01025812, + "balance_loss_clip": 1.04866207, + "balance_loss_mlp": 1.01889789, + "epoch": 0.6976492514880057, + "flos": 12895630295040.0, + "grad_norm": 2.2069679831906193, + "language_loss": 0.74260998, + "learning_rate": 8.845722832481441e-07, + "loss": 0.76441282, + "num_input_tokens_seen": 124854215, + "step": 5802, + "time_per_iteration": 3.125596761703491 + }, + { + "auxiliary_loss_clip": 0.01153587, + "auxiliary_loss_mlp": 0.01024297, + "balance_loss_clip": 1.04753709, + "balance_loss_mlp": 1.01719534, + "epoch": 0.6977694943786449, + "flos": 24352929308160.0, + "grad_norm": 1.85412657963025, + "language_loss": 0.77529085, + "learning_rate": 8.83925795474858e-07, + "loss": 0.79706967, + "num_input_tokens_seen": 124874340, + "step": 5803, + "time_per_iteration": 2.491938352584839 + }, + { + "auxiliary_loss_clip": 0.01122467, + "auxiliary_loss_mlp": 0.0102388, + "balance_loss_clip": 1.04618394, + "balance_loss_mlp": 1.01614308, + "epoch": 0.6978897372692839, + "flos": 29898470257920.0, + "grad_norm": 3.696652994148292, + "language_loss": 0.58865392, + "learning_rate": 8.832794770091414e-07, + "loss": 0.61011744, + "num_input_tokens_seen": 124895175, + "step": 5804, + "time_per_iteration": 3.4000065326690674 + }, + { + "auxiliary_loss_clip": 0.01144866, + "auxiliary_loss_mlp": 0.01025692, + "balance_loss_clip": 1.04633486, + "balance_loss_mlp": 1.01811647, + "epoch": 0.698009980159923, + "flos": 21761579450880.0, + "grad_norm": 2.0113103291204126, + "language_loss": 0.82701099, + "learning_rate": 8.826333279490401e-07, + "loss": 0.84871662, + "num_input_tokens_seen": 124915810, + "step": 5805, + "time_per_iteration": 2.49599289894104 + }, + { + "auxiliary_loss_clip": 0.01143454, + "auxiliary_loss_mlp": 0.01026419, + "balance_loss_clip": 1.04748416, + "balance_loss_mlp": 1.0194962, + "epoch": 0.6981302230505622, + "flos": 19857164267520.0, + "grad_norm": 2.0915443591533593, + "language_loss": 0.67778647, + "learning_rate": 8.819873483925748e-07, + "loss": 0.69948518, + "num_input_tokens_seen": 124932930, + "step": 5806, + "time_per_iteration": 2.4617981910705566 + }, + { + "auxiliary_loss_clip": 0.01129893, + "auxiliary_loss_mlp": 0.00761986, + "balance_loss_clip": 1.04670787, + "balance_loss_mlp": 1.00050414, + "epoch": 0.6982504659412012, + "flos": 22198648141440.0, + "grad_norm": 1.9298570994482234, + "language_loss": 0.74184799, + "learning_rate": 8.81341538437739e-07, + "loss": 0.7607668, + "num_input_tokens_seen": 124951220, + "step": 5807, + "time_per_iteration": 3.4212121963500977 + }, + { + "auxiliary_loss_clip": 0.01142352, + "auxiliary_loss_mlp": 0.01022129, + "balance_loss_clip": 1.04330921, + "balance_loss_mlp": 1.0147146, + "epoch": 0.6983707088318403, + "flos": 35588479708800.0, + "grad_norm": 1.9772594302035267, + "language_loss": 0.68066496, + "learning_rate": 8.80695898182503e-07, + "loss": 0.70230979, + "num_input_tokens_seen": 124972200, + "step": 5808, + "time_per_iteration": 2.6189494132995605 + }, + { + "auxiliary_loss_clip": 0.01060003, + "auxiliary_loss_mlp": 0.01001457, + "balance_loss_clip": 1.02090359, + "balance_loss_mlp": 1.00030029, + "epoch": 0.6984909517224794, + "flos": 65440052760960.0, + "grad_norm": 0.8101711278610646, + "language_loss": 0.65125275, + "learning_rate": 8.800504277248093e-07, + "loss": 0.67186737, + "num_input_tokens_seen": 125036950, + "step": 5809, + "time_per_iteration": 3.8643782138824463 + }, + { + "auxiliary_loss_clip": 0.01127645, + "auxiliary_loss_mlp": 0.00762036, + "balance_loss_clip": 1.05123889, + "balance_loss_mlp": 1.00052738, + "epoch": 0.6986111946131185, + "flos": 18546927863040.0, + "grad_norm": 2.212635760890346, + "language_loss": 0.75038385, + "learning_rate": 8.794051271625753e-07, + "loss": 0.76928067, + "num_input_tokens_seen": 125054585, + "step": 5810, + "time_per_iteration": 2.5123722553253174 + }, + { + "auxiliary_loss_clip": 0.01138871, + "auxiliary_loss_mlp": 0.01024201, + "balance_loss_clip": 1.0467298, + "balance_loss_mlp": 1.01727223, + "epoch": 0.6987314375037575, + "flos": 23039173370880.0, + "grad_norm": 2.6801948281864836, + "language_loss": 0.83186746, + "learning_rate": 8.787599965936925e-07, + "loss": 0.85349822, + "num_input_tokens_seen": 125075515, + "step": 5811, + "time_per_iteration": 2.506580114364624 + }, + { + "auxiliary_loss_clip": 0.01120556, + "auxiliary_loss_mlp": 0.01023639, + "balance_loss_clip": 1.04547179, + "balance_loss_mlp": 1.01669192, + "epoch": 0.6988516803943967, + "flos": 38400393029760.0, + "grad_norm": 1.715966941119164, + "language_loss": 0.72301733, + "learning_rate": 8.781150361160261e-07, + "loss": 0.74445927, + "num_input_tokens_seen": 125097425, + "step": 5812, + "time_per_iteration": 2.6715238094329834 + }, + { + "auxiliary_loss_clip": 0.01130334, + "auxiliary_loss_mlp": 0.0102635, + "balance_loss_clip": 1.04548073, + "balance_loss_mlp": 1.01907253, + "epoch": 0.6989719232850358, + "flos": 24096993926400.0, + "grad_norm": 1.5827104973818182, + "language_loss": 0.73502684, + "learning_rate": 8.774702458274181e-07, + "loss": 0.75659364, + "num_input_tokens_seen": 125117830, + "step": 5813, + "time_per_iteration": 2.5260837078094482 + }, + { + "auxiliary_loss_clip": 0.01156398, + "auxiliary_loss_mlp": 0.01025836, + "balance_loss_clip": 1.04887247, + "balance_loss_mlp": 1.01809919, + "epoch": 0.6990921661756748, + "flos": 14866838818560.0, + "grad_norm": 2.3991094318439297, + "language_loss": 0.70330489, + "learning_rate": 8.768256258256799e-07, + "loss": 0.72512722, + "num_input_tokens_seen": 125134455, + "step": 5814, + "time_per_iteration": 2.4202916622161865 + }, + { + "auxiliary_loss_clip": 0.01157571, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.0483619, + "balance_loss_mlp": 1.02130151, + "epoch": 0.699212409066314, + "flos": 20193719725440.0, + "grad_norm": 1.6305378040662866, + "language_loss": 0.74216193, + "learning_rate": 8.76181176208602e-07, + "loss": 0.76402497, + "num_input_tokens_seen": 125152555, + "step": 5815, + "time_per_iteration": 2.4460389614105225 + }, + { + "auxiliary_loss_clip": 0.01102664, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.04071593, + "balance_loss_mlp": 1.02374852, + "epoch": 0.699332651956953, + "flos": 19427888828160.0, + "grad_norm": 1.9788421832959064, + "language_loss": 0.73437583, + "learning_rate": 8.755368970739461e-07, + "loss": 0.75572062, + "num_input_tokens_seen": 125171915, + "step": 5816, + "time_per_iteration": 2.550799608230591 + }, + { + "auxiliary_loss_clip": 0.01130549, + "auxiliary_loss_mlp": 0.01026508, + "balance_loss_clip": 1.04349005, + "balance_loss_mlp": 1.01863408, + "epoch": 0.6994528948475921, + "flos": 16143714466560.0, + "grad_norm": 3.259344368100055, + "language_loss": 0.61532134, + "learning_rate": 8.748927885194479e-07, + "loss": 0.6368919, + "num_input_tokens_seen": 125190220, + "step": 5817, + "time_per_iteration": 2.5015554428100586 + }, + { + "auxiliary_loss_clip": 0.01027549, + "auxiliary_loss_mlp": 0.01004755, + "balance_loss_clip": 1.01224947, + "balance_loss_mlp": 1.00376534, + "epoch": 0.6995731377382313, + "flos": 64952420699520.0, + "grad_norm": 0.7905983814474801, + "language_loss": 0.57417536, + "learning_rate": 8.742488506428209e-07, + "loss": 0.5944984, + "num_input_tokens_seen": 125249310, + "step": 5818, + "time_per_iteration": 3.0465736389160156 + }, + { + "auxiliary_loss_clip": 0.01143125, + "auxiliary_loss_mlp": 0.00761813, + "balance_loss_clip": 1.04676628, + "balance_loss_mlp": 1.00048351, + "epoch": 0.6996933806288703, + "flos": 24900136076160.0, + "grad_norm": 1.8095859629611606, + "language_loss": 0.78231585, + "learning_rate": 8.736050835417466e-07, + "loss": 0.8013652, + "num_input_tokens_seen": 125269350, + "step": 5819, + "time_per_iteration": 2.534228563308716 + }, + { + "auxiliary_loss_clip": 0.01159359, + "auxiliary_loss_mlp": 0.0102635, + "balance_loss_clip": 1.04909015, + "balance_loss_mlp": 1.01872015, + "epoch": 0.6998136235195094, + "flos": 20777806782720.0, + "grad_norm": 1.8192816756510883, + "language_loss": 0.61549938, + "learning_rate": 8.729614873138862e-07, + "loss": 0.63735652, + "num_input_tokens_seen": 125286985, + "step": 5820, + "time_per_iteration": 2.450200319290161 + }, + { + "auxiliary_loss_clip": 0.01121254, + "auxiliary_loss_mlp": 0.01026307, + "balance_loss_clip": 1.0469892, + "balance_loss_mlp": 1.01854026, + "epoch": 0.6999338664101485, + "flos": 23733470332800.0, + "grad_norm": 1.903322738519752, + "language_loss": 0.77887797, + "learning_rate": 8.723180620568716e-07, + "loss": 0.80035359, + "num_input_tokens_seen": 125306240, + "step": 5821, + "time_per_iteration": 2.588735818862915 + }, + { + "auxiliary_loss_clip": 0.01143045, + "auxiliary_loss_mlp": 0.01022418, + "balance_loss_clip": 1.0449996, + "balance_loss_mlp": 1.01518822, + "epoch": 0.7000541093007876, + "flos": 19864598382720.0, + "grad_norm": 1.7493931980047281, + "language_loss": 0.85130024, + "learning_rate": 8.716748078683116e-07, + "loss": 0.87295485, + "num_input_tokens_seen": 125323015, + "step": 5822, + "time_per_iteration": 2.47420597076416 + }, + { + "auxiliary_loss_clip": 0.01073595, + "auxiliary_loss_mlp": 0.01027216, + "balance_loss_clip": 1.03788662, + "balance_loss_mlp": 1.01860905, + "epoch": 0.7001743521914267, + "flos": 29679056029440.0, + "grad_norm": 2.055232501455391, + "language_loss": 0.68827254, + "learning_rate": 8.710317248457855e-07, + "loss": 0.70928067, + "num_input_tokens_seen": 125342630, + "step": 5823, + "time_per_iteration": 2.6623828411102295 + }, + { + "auxiliary_loss_clip": 0.01136376, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_clip": 1.04667652, + "balance_loss_mlp": 1.01930285, + "epoch": 0.7002945950820658, + "flos": 27489762080640.0, + "grad_norm": 1.7973682424863193, + "language_loss": 0.72462505, + "learning_rate": 8.703888130868482e-07, + "loss": 0.74625695, + "num_input_tokens_seen": 125364480, + "step": 5824, + "time_per_iteration": 2.5532102584838867 + }, + { + "auxiliary_loss_clip": 0.01126737, + "auxiliary_loss_mlp": 0.01023523, + "balance_loss_clip": 1.0446353, + "balance_loss_mlp": 1.01682925, + "epoch": 0.7004148379727049, + "flos": 22158463800960.0, + "grad_norm": 2.822393128196926, + "language_loss": 0.82287407, + "learning_rate": 8.697460726890307e-07, + "loss": 0.84437668, + "num_input_tokens_seen": 125381625, + "step": 5825, + "time_per_iteration": 2.5144972801208496 + }, + { + "auxiliary_loss_clip": 0.011252, + "auxiliary_loss_mlp": 0.00762111, + "balance_loss_clip": 1.04229474, + "balance_loss_mlp": 1.00047016, + "epoch": 0.7005350808633439, + "flos": 19423758764160.0, + "grad_norm": 1.9369837674046624, + "language_loss": 0.90281129, + "learning_rate": 8.691035037498354e-07, + "loss": 0.92168444, + "num_input_tokens_seen": 125397615, + "step": 5826, + "time_per_iteration": 2.5030462741851807 + }, + { + "auxiliary_loss_clip": 0.01136311, + "auxiliary_loss_mlp": 0.01024248, + "balance_loss_clip": 1.04306197, + "balance_loss_mlp": 1.0170002, + "epoch": 0.7006553237539831, + "flos": 23476708938240.0, + "grad_norm": 1.8795623109171808, + "language_loss": 0.72476053, + "learning_rate": 8.684611063667391e-07, + "loss": 0.74636608, + "num_input_tokens_seen": 125418080, + "step": 5827, + "time_per_iteration": 2.5319175720214844 + }, + { + "auxiliary_loss_clip": 0.01153478, + "auxiliary_loss_mlp": 0.01023916, + "balance_loss_clip": 1.04597783, + "balance_loss_mlp": 1.0171659, + "epoch": 0.7007755666446221, + "flos": 31212872640000.0, + "grad_norm": 1.7575375963367066, + "language_loss": 0.76704061, + "learning_rate": 8.678188806371935e-07, + "loss": 0.78881454, + "num_input_tokens_seen": 125440115, + "step": 5828, + "time_per_iteration": 2.565089464187622 + }, + { + "auxiliary_loss_clip": 0.01153655, + "auxiliary_loss_mlp": 0.01023954, + "balance_loss_clip": 1.04608214, + "balance_loss_mlp": 1.01746929, + "epoch": 0.7008958095352612, + "flos": 18149899858560.0, + "grad_norm": 1.627767872174666, + "language_loss": 0.85306025, + "learning_rate": 8.671768266586228e-07, + "loss": 0.87483633, + "num_input_tokens_seen": 125458240, + "step": 5829, + "time_per_iteration": 3.2037551403045654 + }, + { + "auxiliary_loss_clip": 0.01123475, + "auxiliary_loss_mlp": 0.01025652, + "balance_loss_clip": 1.04342556, + "balance_loss_mlp": 1.01865458, + "epoch": 0.7010160524259004, + "flos": 27452307173760.0, + "grad_norm": 1.675482902238848, + "language_loss": 0.78192276, + "learning_rate": 8.665349445284275e-07, + "loss": 0.80341399, + "num_input_tokens_seen": 125477980, + "step": 5830, + "time_per_iteration": 2.5903918743133545 + }, + { + "auxiliary_loss_clip": 0.01126801, + "auxiliary_loss_mlp": 0.01021239, + "balance_loss_clip": 1.04730821, + "balance_loss_mlp": 1.01403618, + "epoch": 0.7011362953165394, + "flos": 23842064125440.0, + "grad_norm": 1.459501148550249, + "language_loss": 0.81086689, + "learning_rate": 8.658932343439799e-07, + "loss": 0.83234727, + "num_input_tokens_seen": 125497765, + "step": 5831, + "time_per_iteration": 3.386246919631958 + }, + { + "auxiliary_loss_clip": 0.01168793, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.04845846, + "balance_loss_mlp": 1.02040112, + "epoch": 0.7012565382071785, + "flos": 24823430582400.0, + "grad_norm": 1.9015881522778693, + "language_loss": 0.77469099, + "learning_rate": 8.65251696202627e-07, + "loss": 0.79666102, + "num_input_tokens_seen": 125514145, + "step": 5832, + "time_per_iteration": 2.444854259490967 + }, + { + "auxiliary_loss_clip": 0.01130083, + "auxiliary_loss_mlp": 0.01024944, + "balance_loss_clip": 1.04674983, + "balance_loss_mlp": 1.01758909, + "epoch": 0.7013767810978175, + "flos": 21397445326080.0, + "grad_norm": 2.2945403525867394, + "language_loss": 0.87699163, + "learning_rate": 8.646103302016896e-07, + "loss": 0.89854187, + "num_input_tokens_seen": 125533115, + "step": 5833, + "time_per_iteration": 3.3560352325439453 + }, + { + "auxiliary_loss_clip": 0.01122077, + "auxiliary_loss_mlp": 0.01024399, + "balance_loss_clip": 1.04351306, + "balance_loss_mlp": 1.01680541, + "epoch": 0.7014970239884567, + "flos": 16687150306560.0, + "grad_norm": 1.8912571964271003, + "language_loss": 0.88527739, + "learning_rate": 8.639691364384614e-07, + "loss": 0.9067421, + "num_input_tokens_seen": 125550740, + "step": 5834, + "time_per_iteration": 2.555995464324951 + }, + { + "auxiliary_loss_clip": 0.01143218, + "auxiliary_loss_mlp": 0.01028679, + "balance_loss_clip": 1.04683483, + "balance_loss_mlp": 1.02082062, + "epoch": 0.7016172668790958, + "flos": 12568268718720.0, + "grad_norm": 1.9878251352231335, + "language_loss": 0.7277925, + "learning_rate": 8.633281150102136e-07, + "loss": 0.74951148, + "num_input_tokens_seen": 125567590, + "step": 5835, + "time_per_iteration": 2.4555845260620117 + }, + { + "auxiliary_loss_clip": 0.01140318, + "auxiliary_loss_mlp": 0.01021705, + "balance_loss_clip": 1.04754233, + "balance_loss_mlp": 1.0147016, + "epoch": 0.7017375097697348, + "flos": 17452729808640.0, + "grad_norm": 5.251000715413512, + "language_loss": 0.67891234, + "learning_rate": 8.626872660141855e-07, + "loss": 0.70053256, + "num_input_tokens_seen": 125585500, + "step": 5836, + "time_per_iteration": 3.2454843521118164 + }, + { + "auxiliary_loss_clip": 0.01112075, + "auxiliary_loss_mlp": 0.01028037, + "balance_loss_clip": 1.04408634, + "balance_loss_mlp": 1.02060997, + "epoch": 0.701857752660374, + "flos": 18513028402560.0, + "grad_norm": 1.971044312174865, + "language_loss": 0.7454384, + "learning_rate": 8.620465895475957e-07, + "loss": 0.7668395, + "num_input_tokens_seen": 125603720, + "step": 5837, + "time_per_iteration": 2.5258865356445312 + }, + { + "auxiliary_loss_clip": 0.01107985, + "auxiliary_loss_mlp": 0.01025099, + "balance_loss_clip": 1.04362595, + "balance_loss_mlp": 1.01782727, + "epoch": 0.701977995551013, + "flos": 24425971614720.0, + "grad_norm": 1.584783907692154, + "language_loss": 0.75290573, + "learning_rate": 8.614060857076333e-07, + "loss": 0.77423656, + "num_input_tokens_seen": 125624390, + "step": 5838, + "time_per_iteration": 2.5844318866729736 + }, + { + "auxiliary_loss_clip": 0.01134269, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.04340684, + "balance_loss_mlp": 1.02098477, + "epoch": 0.7020982384416521, + "flos": 23002759958400.0, + "grad_norm": 1.9210371169033498, + "language_loss": 0.74943495, + "learning_rate": 8.60765754591462e-07, + "loss": 0.7710644, + "num_input_tokens_seen": 125644085, + "step": 5839, + "time_per_iteration": 2.496074914932251 + }, + { + "auxiliary_loss_clip": 0.0116628, + "auxiliary_loss_mlp": 0.01024093, + "balance_loss_clip": 1.04787827, + "balance_loss_mlp": 1.01695824, + "epoch": 0.7022184813322913, + "flos": 20449080489600.0, + "grad_norm": 2.405025307998839, + "language_loss": 0.72841823, + "learning_rate": 8.601255962962211e-07, + "loss": 0.75032198, + "num_input_tokens_seen": 125663095, + "step": 5840, + "time_per_iteration": 2.4173026084899902 + }, + { + "auxiliary_loss_clip": 0.0116535, + "auxiliary_loss_mlp": 0.0102879, + "balance_loss_clip": 1.05135846, + "balance_loss_mlp": 1.0206008, + "epoch": 0.7023387242229303, + "flos": 19790514581760.0, + "grad_norm": 2.4169471783817555, + "language_loss": 0.72169554, + "learning_rate": 8.594856109190194e-07, + "loss": 0.74363697, + "num_input_tokens_seen": 125680125, + "step": 5841, + "time_per_iteration": 2.4390385150909424 + }, + { + "auxiliary_loss_clip": 0.01168743, + "auxiliary_loss_mlp": 0.01023853, + "balance_loss_clip": 1.04885232, + "balance_loss_mlp": 1.01621222, + "epoch": 0.7024589671135694, + "flos": 33259278286080.0, + "grad_norm": 1.6536870667754038, + "language_loss": 0.68811655, + "learning_rate": 8.588457985569446e-07, + "loss": 0.71004248, + "num_input_tokens_seen": 125703035, + "step": 5842, + "time_per_iteration": 2.551112413406372 + }, + { + "auxiliary_loss_clip": 0.01170963, + "auxiliary_loss_mlp": 0.01027417, + "balance_loss_clip": 1.04891908, + "balance_loss_mlp": 1.01988006, + "epoch": 0.7025792100042085, + "flos": 19098982967040.0, + "grad_norm": 2.325769154993244, + "language_loss": 0.72026587, + "learning_rate": 8.582061593070542e-07, + "loss": 0.74224973, + "num_input_tokens_seen": 125723765, + "step": 5843, + "time_per_iteration": 2.433300733566284 + }, + { + "auxiliary_loss_clip": 0.01169784, + "auxiliary_loss_mlp": 0.00761986, + "balance_loss_clip": 1.04952431, + "balance_loss_mlp": 1.00046539, + "epoch": 0.7026994528948476, + "flos": 18952611045120.0, + "grad_norm": 2.09562273630407, + "language_loss": 0.77006441, + "learning_rate": 8.57566693266383e-07, + "loss": 0.78938204, + "num_input_tokens_seen": 125741455, + "step": 5844, + "time_per_iteration": 2.410703659057617 + }, + { + "auxiliary_loss_clip": 0.01145536, + "auxiliary_loss_mlp": 0.00762762, + "balance_loss_clip": 1.04570544, + "balance_loss_mlp": 1.00048518, + "epoch": 0.7028196957854866, + "flos": 19536662188800.0, + "grad_norm": 6.044492178560601, + "language_loss": 0.69362265, + "learning_rate": 8.569274005319354e-07, + "loss": 0.71270561, + "num_input_tokens_seen": 125759855, + "step": 5845, + "time_per_iteration": 2.4974935054779053 + }, + { + "auxiliary_loss_clip": 0.01150789, + "auxiliary_loss_mlp": 0.01026063, + "balance_loss_clip": 1.04639792, + "balance_loss_mlp": 1.01847196, + "epoch": 0.7029399386761258, + "flos": 20845318394880.0, + "grad_norm": 1.8238291304949386, + "language_loss": 0.79649365, + "learning_rate": 8.562882812006913e-07, + "loss": 0.81826216, + "num_input_tokens_seen": 125777345, + "step": 5846, + "time_per_iteration": 2.4432694911956787 + }, + { + "auxiliary_loss_clip": 0.01165123, + "auxiliary_loss_mlp": 0.01027856, + "balance_loss_clip": 1.04709482, + "balance_loss_mlp": 1.02019072, + "epoch": 0.7030601815667649, + "flos": 22055005653120.0, + "grad_norm": 2.237983222467098, + "language_loss": 0.77741587, + "learning_rate": 8.556493353696066e-07, + "loss": 0.79934567, + "num_input_tokens_seen": 125796345, + "step": 5847, + "time_per_iteration": 2.4328713417053223 + }, + { + "auxiliary_loss_clip": 0.01158836, + "auxiliary_loss_mlp": 0.00762288, + "balance_loss_clip": 1.05026376, + "balance_loss_mlp": 1.0005362, + "epoch": 0.7031804244574039, + "flos": 27198742089600.0, + "grad_norm": 2.322622129372933, + "language_loss": 0.68586165, + "learning_rate": 8.550105631356077e-07, + "loss": 0.70507288, + "num_input_tokens_seen": 125816070, + "step": 5848, + "time_per_iteration": 2.4995882511138916 + }, + { + "auxiliary_loss_clip": 0.01121023, + "auxiliary_loss_mlp": 0.01026495, + "balance_loss_clip": 1.04183519, + "balance_loss_mlp": 1.01860905, + "epoch": 0.7033006673480431, + "flos": 22379853277440.0, + "grad_norm": 1.845409076447715, + "language_loss": 0.77179873, + "learning_rate": 8.543719645955961e-07, + "loss": 0.79327393, + "num_input_tokens_seen": 125834400, + "step": 5849, + "time_per_iteration": 2.524508476257324 + }, + { + "auxiliary_loss_clip": 0.01141703, + "auxiliary_loss_mlp": 0.01022721, + "balance_loss_clip": 1.04646909, + "balance_loss_mlp": 1.01544976, + "epoch": 0.7034209102386821, + "flos": 24715986024960.0, + "grad_norm": 1.5584195849754012, + "language_loss": 0.74670684, + "learning_rate": 8.537335398464467e-07, + "loss": 0.76835114, + "num_input_tokens_seen": 125854720, + "step": 5850, + "time_per_iteration": 2.5125372409820557 + }, + { + "auxiliary_loss_clip": 0.01139523, + "auxiliary_loss_mlp": 0.01028532, + "balance_loss_clip": 1.04275489, + "balance_loss_mlp": 1.02102232, + "epoch": 0.7035411531293212, + "flos": 22556174163840.0, + "grad_norm": 2.5716388290327483, + "language_loss": 0.85361779, + "learning_rate": 8.53095288985007e-07, + "loss": 0.87529838, + "num_input_tokens_seen": 125868455, + "step": 5851, + "time_per_iteration": 2.4696247577667236 + }, + { + "auxiliary_loss_clip": 0.01166293, + "auxiliary_loss_mlp": 0.01022844, + "balance_loss_clip": 1.04903579, + "balance_loss_mlp": 1.01557183, + "epoch": 0.7036613960199604, + "flos": 22674967418880.0, + "grad_norm": 1.6514324453961566, + "language_loss": 0.82382655, + "learning_rate": 8.524572121081009e-07, + "loss": 0.84571797, + "num_input_tokens_seen": 125888555, + "step": 5852, + "time_per_iteration": 2.426699161529541 + }, + { + "auxiliary_loss_clip": 0.01159088, + "auxiliary_loss_mlp": 0.01028098, + "balance_loss_clip": 1.04752481, + "balance_loss_mlp": 1.02080822, + "epoch": 0.7037816389105994, + "flos": 22492146170880.0, + "grad_norm": 2.485132491332381, + "language_loss": 0.62506866, + "learning_rate": 8.518193093125232e-07, + "loss": 0.64694047, + "num_input_tokens_seen": 125907610, + "step": 5853, + "time_per_iteration": 2.4579017162323 + }, + { + "auxiliary_loss_clip": 0.01145507, + "auxiliary_loss_mlp": 0.01024203, + "balance_loss_clip": 1.04743564, + "balance_loss_mlp": 1.01741982, + "epoch": 0.7039018818012385, + "flos": 27087490690560.0, + "grad_norm": 1.6433352001801482, + "language_loss": 0.80866694, + "learning_rate": 8.511815806950436e-07, + "loss": 0.83036405, + "num_input_tokens_seen": 125928640, + "step": 5854, + "time_per_iteration": 2.591630697250366 + }, + { + "auxiliary_loss_clip": 0.01153622, + "auxiliary_loss_mlp": 0.01023292, + "balance_loss_clip": 1.04581761, + "balance_loss_mlp": 1.0159843, + "epoch": 0.7040221246918776, + "flos": 17749819198080.0, + "grad_norm": 1.6436640800644307, + "language_loss": 0.78003961, + "learning_rate": 8.505440263524044e-07, + "loss": 0.80180871, + "num_input_tokens_seen": 125947485, + "step": 5855, + "time_per_iteration": 3.2241318225860596 + }, + { + "auxiliary_loss_clip": 0.01155432, + "auxiliary_loss_mlp": 0.01022679, + "balance_loss_clip": 1.04542816, + "balance_loss_mlp": 1.01461482, + "epoch": 0.7041423675825167, + "flos": 16279851012480.0, + "grad_norm": 3.934870077160221, + "language_loss": 0.88130814, + "learning_rate": 8.49906646381322e-07, + "loss": 0.90308917, + "num_input_tokens_seen": 125960320, + "step": 5856, + "time_per_iteration": 2.444519281387329 + }, + { + "auxiliary_loss_clip": 0.01130476, + "auxiliary_loss_mlp": 0.01022495, + "balance_loss_clip": 1.04649091, + "balance_loss_mlp": 1.01571178, + "epoch": 0.7042626104731557, + "flos": 25483181639040.0, + "grad_norm": 2.630008703780646, + "language_loss": 0.72072744, + "learning_rate": 8.492694408784884e-07, + "loss": 0.74225724, + "num_input_tokens_seen": 125980575, + "step": 5857, + "time_per_iteration": 2.5613455772399902 + }, + { + "auxiliary_loss_clip": 0.01158781, + "auxiliary_loss_mlp": 0.01027697, + "balance_loss_clip": 1.04797339, + "balance_loss_mlp": 1.02060962, + "epoch": 0.7043828533637949, + "flos": 17857622891520.0, + "grad_norm": 3.749287466560704, + "language_loss": 0.62531662, + "learning_rate": 8.486324099405642e-07, + "loss": 0.64718139, + "num_input_tokens_seen": 125997420, + "step": 5858, + "time_per_iteration": 3.2527029514312744 + }, + { + "auxiliary_loss_clip": 0.0115246, + "auxiliary_loss_mlp": 0.01026192, + "balance_loss_clip": 1.0462575, + "balance_loss_mlp": 1.01937926, + "epoch": 0.704503096254434, + "flos": 29494259533440.0, + "grad_norm": 1.6526191757965571, + "language_loss": 0.74981928, + "learning_rate": 8.479955536641887e-07, + "loss": 0.77160579, + "num_input_tokens_seen": 126018915, + "step": 5859, + "time_per_iteration": 3.3527369499206543 + }, + { + "auxiliary_loss_clip": 0.0113179, + "auxiliary_loss_mlp": 0.01026169, + "balance_loss_clip": 1.04039085, + "balance_loss_mlp": 1.01920414, + "epoch": 0.704623339145073, + "flos": 30920739327360.0, + "grad_norm": 1.9982653628858016, + "language_loss": 0.66674972, + "learning_rate": 8.473588721459716e-07, + "loss": 0.68832934, + "num_input_tokens_seen": 126038825, + "step": 5860, + "time_per_iteration": 2.5654003620147705 + }, + { + "auxiliary_loss_clip": 0.01157067, + "auxiliary_loss_mlp": 0.01038139, + "balance_loss_clip": 1.05032432, + "balance_loss_mlp": 1.02970552, + "epoch": 0.7047435820357122, + "flos": 23914747296000.0, + "grad_norm": 2.081898189509378, + "language_loss": 0.70755208, + "learning_rate": 8.467223654824967e-07, + "loss": 0.72950417, + "num_input_tokens_seen": 126058280, + "step": 5861, + "time_per_iteration": 2.479520320892334 + }, + { + "auxiliary_loss_clip": 0.011484, + "auxiliary_loss_mlp": 0.01025183, + "balance_loss_clip": 1.04636121, + "balance_loss_mlp": 1.01795602, + "epoch": 0.7048638249263512, + "flos": 46494010926720.0, + "grad_norm": 1.795873877181477, + "language_loss": 0.62471318, + "learning_rate": 8.460860337703233e-07, + "loss": 0.64644897, + "num_input_tokens_seen": 126078885, + "step": 5862, + "time_per_iteration": 3.4283645153045654 + }, + { + "auxiliary_loss_clip": 0.0111548, + "auxiliary_loss_mlp": 0.01030576, + "balance_loss_clip": 1.04255819, + "balance_loss_mlp": 1.02251458, + "epoch": 0.7049840678169903, + "flos": 21689219502720.0, + "grad_norm": 1.743779418961563, + "language_loss": 0.70548517, + "learning_rate": 8.454498771059797e-07, + "loss": 0.7269457, + "num_input_tokens_seen": 126098260, + "step": 5863, + "time_per_iteration": 2.507906198501587 + }, + { + "auxiliary_loss_clip": 0.01106172, + "auxiliary_loss_mlp": 0.01024287, + "balance_loss_clip": 1.04242611, + "balance_loss_mlp": 1.01622832, + "epoch": 0.7051043107076294, + "flos": 18405081054720.0, + "grad_norm": 2.095799603692248, + "language_loss": 0.83618104, + "learning_rate": 8.448138955859725e-07, + "loss": 0.85748565, + "num_input_tokens_seen": 126114845, + "step": 5864, + "time_per_iteration": 2.513786792755127 + }, + { + "auxiliary_loss_clip": 0.01141321, + "auxiliary_loss_mlp": 0.01025458, + "balance_loss_clip": 1.04597449, + "balance_loss_mlp": 1.01797152, + "epoch": 0.7052245535982685, + "flos": 19319043640320.0, + "grad_norm": 1.8849638653558245, + "language_loss": 0.90191555, + "learning_rate": 8.44178089306778e-07, + "loss": 0.92358327, + "num_input_tokens_seen": 126132780, + "step": 5865, + "time_per_iteration": 2.4798450469970703 + }, + { + "auxiliary_loss_clip": 0.01167238, + "auxiliary_loss_mlp": 0.01023918, + "balance_loss_clip": 1.0483036, + "balance_loss_mlp": 1.01717961, + "epoch": 0.7053447964889076, + "flos": 19062138591360.0, + "grad_norm": 1.7480958491746597, + "language_loss": 0.7696026, + "learning_rate": 8.4354245836485e-07, + "loss": 0.79151416, + "num_input_tokens_seen": 126151225, + "step": 5866, + "time_per_iteration": 2.4040088653564453 + }, + { + "auxiliary_loss_clip": 0.01129014, + "auxiliary_loss_mlp": 0.0102406, + "balance_loss_clip": 1.04613924, + "balance_loss_mlp": 1.01572466, + "epoch": 0.7054650393795466, + "flos": 27379228953600.0, + "grad_norm": 1.4810631398635954, + "language_loss": 0.72855306, + "learning_rate": 8.429070028566108e-07, + "loss": 0.7500838, + "num_input_tokens_seen": 126172535, + "step": 5867, + "time_per_iteration": 2.573024272918701 + }, + { + "auxiliary_loss_clip": 0.01153429, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.04831207, + "balance_loss_mlp": 1.0206387, + "epoch": 0.7055852822701858, + "flos": 16102201322880.0, + "grad_norm": 1.826659008747168, + "language_loss": 0.75051713, + "learning_rate": 8.422717228784586e-07, + "loss": 0.77233487, + "num_input_tokens_seen": 126189410, + "step": 5868, + "time_per_iteration": 2.441877603530884 + }, + { + "auxiliary_loss_clip": 0.01113021, + "auxiliary_loss_mlp": 0.01026678, + "balance_loss_clip": 1.04725575, + "balance_loss_mlp": 1.01892662, + "epoch": 0.7057055251608249, + "flos": 11692299744000.0, + "grad_norm": 1.8004013758980293, + "language_loss": 0.69582367, + "learning_rate": 8.416366185267663e-07, + "loss": 0.7172206, + "num_input_tokens_seen": 126206910, + "step": 5869, + "time_per_iteration": 2.510385036468506 + }, + { + "auxiliary_loss_clip": 0.01154063, + "auxiliary_loss_mlp": 0.0102178, + "balance_loss_clip": 1.04589319, + "balance_loss_mlp": 1.01464534, + "epoch": 0.7058257680514639, + "flos": 22711560399360.0, + "grad_norm": 1.6831362460816388, + "language_loss": 0.78001696, + "learning_rate": 8.410016898978778e-07, + "loss": 0.80177534, + "num_input_tokens_seen": 126224385, + "step": 5870, + "time_per_iteration": 2.459655523300171 + }, + { + "auxiliary_loss_clip": 0.01112938, + "auxiliary_loss_mlp": 0.01025937, + "balance_loss_clip": 1.04558218, + "balance_loss_mlp": 1.0186981, + "epoch": 0.7059460109421031, + "flos": 17529543043200.0, + "grad_norm": 1.6492320170949961, + "language_loss": 0.78770697, + "learning_rate": 8.403669370881115e-07, + "loss": 0.80909574, + "num_input_tokens_seen": 126243120, + "step": 5871, + "time_per_iteration": 2.5525572299957275 + }, + { + "auxiliary_loss_clip": 0.01168931, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_clip": 1.04996181, + "balance_loss_mlp": 1.01989484, + "epoch": 0.7060662538327421, + "flos": 23544687427200.0, + "grad_norm": 1.8383117626121872, + "language_loss": 0.78571761, + "learning_rate": 8.397323601937587e-07, + "loss": 0.80767369, + "num_input_tokens_seen": 126263020, + "step": 5872, + "time_per_iteration": 2.429551362991333 + }, + { + "auxiliary_loss_clip": 0.01120571, + "auxiliary_loss_mlp": 0.01026093, + "balance_loss_clip": 1.04356551, + "balance_loss_mlp": 1.01910996, + "epoch": 0.7061864967233812, + "flos": 30260736875520.0, + "grad_norm": 1.745887519231487, + "language_loss": 0.76808316, + "learning_rate": 8.390979593110838e-07, + "loss": 0.78954977, + "num_input_tokens_seen": 126285150, + "step": 5873, + "time_per_iteration": 2.6032698154449463 + }, + { + "auxiliary_loss_clip": 0.01145353, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.04846871, + "balance_loss_mlp": 1.01808167, + "epoch": 0.7063067396140204, + "flos": 20701460424960.0, + "grad_norm": 1.4806571208470214, + "language_loss": 0.81541359, + "learning_rate": 8.384637345363262e-07, + "loss": 0.83712661, + "num_input_tokens_seen": 126304340, + "step": 5874, + "time_per_iteration": 2.5697927474975586 + }, + { + "auxiliary_loss_clip": 0.01132994, + "auxiliary_loss_mlp": 0.01023677, + "balance_loss_clip": 1.04261243, + "balance_loss_mlp": 1.01652193, + "epoch": 0.7064269825046594, + "flos": 32266168081920.0, + "grad_norm": 1.7318408462134016, + "language_loss": 0.766119, + "learning_rate": 8.378296859656964e-07, + "loss": 0.78768569, + "num_input_tokens_seen": 126325495, + "step": 5875, + "time_per_iteration": 2.5778212547302246 + }, + { + "auxiliary_loss_clip": 0.01141256, + "auxiliary_loss_mlp": 0.01028521, + "balance_loss_clip": 1.04652238, + "balance_loss_mlp": 1.02108204, + "epoch": 0.7065472253952985, + "flos": 30227124723840.0, + "grad_norm": 2.0260462488986963, + "language_loss": 0.68625867, + "learning_rate": 8.371958136953792e-07, + "loss": 0.70795649, + "num_input_tokens_seen": 126345525, + "step": 5876, + "time_per_iteration": 2.557030439376831 + }, + { + "auxiliary_loss_clip": 0.0112916, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.04245698, + "balance_loss_mlp": 1.02067327, + "epoch": 0.7066674682859376, + "flos": 16216720859520.0, + "grad_norm": 3.4497262046621087, + "language_loss": 0.66013145, + "learning_rate": 8.365621178215326e-07, + "loss": 0.68170869, + "num_input_tokens_seen": 126361995, + "step": 5877, + "time_per_iteration": 2.5502867698669434 + }, + { + "auxiliary_loss_clip": 0.01148615, + "auxiliary_loss_mlp": 0.01025465, + "balance_loss_clip": 1.04501164, + "balance_loss_mlp": 1.01883674, + "epoch": 0.7067877111765767, + "flos": 14830461319680.0, + "grad_norm": 2.467832362969757, + "language_loss": 0.75492555, + "learning_rate": 8.359285984402871e-07, + "loss": 0.77666628, + "num_input_tokens_seen": 126379260, + "step": 5878, + "time_per_iteration": 2.432802438735962 + }, + { + "auxiliary_loss_clip": 0.01134576, + "auxiliary_loss_mlp": 0.01022677, + "balance_loss_clip": 1.04621029, + "balance_loss_mlp": 1.01581955, + "epoch": 0.7069079540672157, + "flos": 25440196037760.0, + "grad_norm": 1.8604467206663504, + "language_loss": 0.73830485, + "learning_rate": 8.352952556477489e-07, + "loss": 0.75987744, + "num_input_tokens_seen": 126397170, + "step": 5879, + "time_per_iteration": 2.5171470642089844 + }, + { + "auxiliary_loss_clip": 0.01153866, + "auxiliary_loss_mlp": 0.01026467, + "balance_loss_clip": 1.04867709, + "balance_loss_mlp": 1.01946938, + "epoch": 0.7070281969578549, + "flos": 24607751368320.0, + "grad_norm": 1.9111363703637, + "language_loss": 0.76861405, + "learning_rate": 8.34662089539993e-07, + "loss": 0.79041731, + "num_input_tokens_seen": 126416680, + "step": 5880, + "time_per_iteration": 2.476982593536377 + }, + { + "auxiliary_loss_clip": 0.01166145, + "auxiliary_loss_mlp": 0.01024269, + "balance_loss_clip": 1.04948759, + "balance_loss_mlp": 1.01721179, + "epoch": 0.707148439848494, + "flos": 26724469887360.0, + "grad_norm": 9.695301230249642, + "language_loss": 0.79246104, + "learning_rate": 8.340291002130722e-07, + "loss": 0.81436527, + "num_input_tokens_seen": 126435870, + "step": 5881, + "time_per_iteration": 2.459730625152588 + }, + { + "auxiliary_loss_clip": 0.01170055, + "auxiliary_loss_mlp": 0.0102628, + "balance_loss_clip": 1.04911423, + "balance_loss_mlp": 1.01866269, + "epoch": 0.707268682739133, + "flos": 15085750256640.0, + "grad_norm": 4.180355208202724, + "language_loss": 0.79438704, + "learning_rate": 8.3339628776301e-07, + "loss": 0.8163504, + "num_input_tokens_seen": 126454010, + "step": 5882, + "time_per_iteration": 3.126051425933838 + }, + { + "auxiliary_loss_clip": 0.01167083, + "auxiliary_loss_mlp": 0.0102498, + "balance_loss_clip": 1.0481497, + "balance_loss_mlp": 1.0180068, + "epoch": 0.7073889256297722, + "flos": 34313148345600.0, + "grad_norm": 1.829479874034662, + "language_loss": 0.57265085, + "learning_rate": 8.327636522858033e-07, + "loss": 0.59457147, + "num_input_tokens_seen": 126473615, + "step": 5883, + "time_per_iteration": 2.517216920852661 + }, + { + "auxiliary_loss_clip": 0.01113286, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.04569483, + "balance_loss_mlp": 1.02137113, + "epoch": 0.7075091685204112, + "flos": 20083940784000.0, + "grad_norm": 1.8732293239577875, + "language_loss": 0.77574593, + "learning_rate": 8.321311938774225e-07, + "loss": 0.79716718, + "num_input_tokens_seen": 126492705, + "step": 5884, + "time_per_iteration": 3.3915157318115234 + }, + { + "auxiliary_loss_clip": 0.01171782, + "auxiliary_loss_mlp": 0.01026484, + "balance_loss_clip": 1.0495218, + "balance_loss_mlp": 1.01899755, + "epoch": 0.7076294114110503, + "flos": 20777124424320.0, + "grad_norm": 1.9646135379962832, + "language_loss": 0.79203027, + "learning_rate": 8.314989126338104e-07, + "loss": 0.81401294, + "num_input_tokens_seen": 126512715, + "step": 5885, + "time_per_iteration": 3.2675116062164307 + }, + { + "auxiliary_loss_clip": 0.01157339, + "auxiliary_loss_mlp": 0.01025987, + "balance_loss_clip": 1.04732287, + "balance_loss_mlp": 1.01873934, + "epoch": 0.7077496543016895, + "flos": 17967689141760.0, + "grad_norm": 1.684375479227643, + "language_loss": 0.84470969, + "learning_rate": 8.308668086508847e-07, + "loss": 0.86654294, + "num_input_tokens_seen": 126530795, + "step": 5886, + "time_per_iteration": 2.439687967300415 + }, + { + "auxiliary_loss_clip": 0.01128684, + "auxiliary_loss_mlp": 0.01020038, + "balance_loss_clip": 1.04210651, + "balance_loss_mlp": 1.01254249, + "epoch": 0.7078698971923285, + "flos": 45478098564480.0, + "grad_norm": 1.881127755599909, + "language_loss": 0.73955005, + "learning_rate": 8.302348820245342e-07, + "loss": 0.76103729, + "num_input_tokens_seen": 126553360, + "step": 5887, + "time_per_iteration": 2.731684446334839 + }, + { + "auxiliary_loss_clip": 0.01126637, + "auxiliary_loss_mlp": 0.01025945, + "balance_loss_clip": 1.04322624, + "balance_loss_mlp": 1.01799381, + "epoch": 0.7079901400829676, + "flos": 26943704547840.0, + "grad_norm": 2.409356345252419, + "language_loss": 0.70149148, + "learning_rate": 8.296031328506232e-07, + "loss": 0.72301733, + "num_input_tokens_seen": 126573110, + "step": 5888, + "time_per_iteration": 2.565338134765625 + }, + { + "auxiliary_loss_clip": 0.01141519, + "auxiliary_loss_mlp": 0.01024775, + "balance_loss_clip": 1.0468049, + "balance_loss_mlp": 1.01764059, + "epoch": 0.7081103829736067, + "flos": 24423206267520.0, + "grad_norm": 1.9519650635385772, + "language_loss": 0.75844556, + "learning_rate": 8.289715612249857e-07, + "loss": 0.78010851, + "num_input_tokens_seen": 126593725, + "step": 5889, + "time_per_iteration": 3.256669044494629 + }, + { + "auxiliary_loss_clip": 0.0113752, + "auxiliary_loss_mlp": 0.01026351, + "balance_loss_clip": 1.04622734, + "balance_loss_mlp": 1.01879311, + "epoch": 0.7082306258642458, + "flos": 18543300589440.0, + "grad_norm": 2.4845261798980314, + "language_loss": 0.77542293, + "learning_rate": 8.283401672434305e-07, + "loss": 0.79706168, + "num_input_tokens_seen": 126608950, + "step": 5890, + "time_per_iteration": 2.437650203704834 + }, + { + "auxiliary_loss_clip": 0.01138333, + "auxiliary_loss_mlp": 0.01026034, + "balance_loss_clip": 1.04798603, + "balance_loss_mlp": 1.01906633, + "epoch": 0.7083508687548848, + "flos": 23477534951040.0, + "grad_norm": 2.413514482741669, + "language_loss": 0.70276546, + "learning_rate": 8.277089510017412e-07, + "loss": 0.72440922, + "num_input_tokens_seen": 126629755, + "step": 5891, + "time_per_iteration": 2.5006752014160156 + }, + { + "auxiliary_loss_clip": 0.011395, + "auxiliary_loss_mlp": 0.01025214, + "balance_loss_clip": 1.0491358, + "balance_loss_mlp": 1.01809764, + "epoch": 0.708471111645524, + "flos": 22419463000320.0, + "grad_norm": 5.2911139257654405, + "language_loss": 0.8237235, + "learning_rate": 8.270779125956719e-07, + "loss": 0.84537065, + "num_input_tokens_seen": 126650135, + "step": 5892, + "time_per_iteration": 2.4883430004119873 + }, + { + "auxiliary_loss_clip": 0.01107796, + "auxiliary_loss_mlp": 0.01024325, + "balance_loss_clip": 1.04331398, + "balance_loss_mlp": 1.01713359, + "epoch": 0.7085913545361631, + "flos": 20922885815040.0, + "grad_norm": 2.6514971254165056, + "language_loss": 0.80068707, + "learning_rate": 8.264470521209505e-07, + "loss": 0.82200825, + "num_input_tokens_seen": 126668500, + "step": 5893, + "time_per_iteration": 2.5412471294403076 + }, + { + "auxiliary_loss_clip": 0.01145371, + "auxiliary_loss_mlp": 0.01023477, + "balance_loss_clip": 1.04493821, + "balance_loss_mlp": 1.01636338, + "epoch": 0.7087115974268021, + "flos": 15012384727680.0, + "grad_norm": 2.3041720460929405, + "language_loss": 0.76861179, + "learning_rate": 8.258163696732785e-07, + "loss": 0.79030031, + "num_input_tokens_seen": 126686090, + "step": 5894, + "time_per_iteration": 2.422011375427246 + }, + { + "auxiliary_loss_clip": 0.01148852, + "auxiliary_loss_mlp": 0.01024945, + "balance_loss_clip": 1.04607105, + "balance_loss_mlp": 1.01791513, + "epoch": 0.7088318403174413, + "flos": 21539040739200.0, + "grad_norm": 1.857016607301194, + "language_loss": 0.7716186, + "learning_rate": 8.251858653483288e-07, + "loss": 0.7933566, + "num_input_tokens_seen": 126704255, + "step": 5895, + "time_per_iteration": 2.457542657852173 + }, + { + "auxiliary_loss_clip": 0.01154831, + "auxiliary_loss_mlp": 0.01023376, + "balance_loss_clip": 1.04921913, + "balance_loss_mlp": 1.01609182, + "epoch": 0.7089520832080803, + "flos": 15516785462400.0, + "grad_norm": 2.015062678844766, + "language_loss": 0.85957199, + "learning_rate": 8.245555392417501e-07, + "loss": 0.88135409, + "num_input_tokens_seen": 126718910, + "step": 5896, + "time_per_iteration": 2.3973546028137207 + }, + { + "auxiliary_loss_clip": 0.01098049, + "auxiliary_loss_mlp": 0.01021564, + "balance_loss_clip": 1.03941357, + "balance_loss_mlp": 1.01419091, + "epoch": 0.7090723260987194, + "flos": 20412667077120.0, + "grad_norm": 2.6988756889881804, + "language_loss": 0.78762686, + "learning_rate": 8.239253914491613e-07, + "loss": 0.80882299, + "num_input_tokens_seen": 126737235, + "step": 5897, + "time_per_iteration": 2.5295279026031494 + }, + { + "auxiliary_loss_clip": 0.01122608, + "auxiliary_loss_mlp": 0.01020553, + "balance_loss_clip": 1.04569817, + "balance_loss_mlp": 1.01364517, + "epoch": 0.7091925689893585, + "flos": 25668337271040.0, + "grad_norm": 1.7954880170059735, + "language_loss": 0.75259316, + "learning_rate": 8.232954220661556e-07, + "loss": 0.77402484, + "num_input_tokens_seen": 126759970, + "step": 5898, + "time_per_iteration": 2.5826380252838135 + }, + { + "auxiliary_loss_clip": 0.01169178, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.05151212, + "balance_loss_mlp": 1.02137613, + "epoch": 0.7093128118799976, + "flos": 24206629213440.0, + "grad_norm": 2.4662082860745635, + "language_loss": 0.70080054, + "learning_rate": 8.226656311882989e-07, + "loss": 0.72277403, + "num_input_tokens_seen": 126779280, + "step": 5899, + "time_per_iteration": 2.4339444637298584 + }, + { + "auxiliary_loss_clip": 0.01151128, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_clip": 1.04739881, + "balance_loss_mlp": 1.01989055, + "epoch": 0.7094330547706367, + "flos": 16646786398080.0, + "grad_norm": 2.0857110750536445, + "language_loss": 0.77122843, + "learning_rate": 8.22036018911129e-07, + "loss": 0.79300779, + "num_input_tokens_seen": 126797310, + "step": 5900, + "time_per_iteration": 2.435462713241577 + }, + { + "auxiliary_loss_clip": 0.01172454, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.04938328, + "balance_loss_mlp": 1.0207299, + "epoch": 0.7095532976612757, + "flos": 16283370545280.0, + "grad_norm": 2.164938336487819, + "language_loss": 0.80656147, + "learning_rate": 8.214065853301599e-07, + "loss": 0.82856911, + "num_input_tokens_seen": 126812840, + "step": 5901, + "time_per_iteration": 2.388033390045166 + }, + { + "auxiliary_loss_clip": 0.01063006, + "auxiliary_loss_mlp": 0.01003132, + "balance_loss_clip": 1.01616704, + "balance_loss_mlp": 1.00214851, + "epoch": 0.7096735405519149, + "flos": 70722080559360.0, + "grad_norm": 0.8472975926833304, + "language_loss": 0.58255839, + "learning_rate": 8.207773305408734e-07, + "loss": 0.60321987, + "num_input_tokens_seen": 126880060, + "step": 5902, + "time_per_iteration": 3.1554813385009766 + }, + { + "auxiliary_loss_clip": 0.01119159, + "auxiliary_loss_mlp": 0.0102758, + "balance_loss_clip": 1.04238808, + "balance_loss_mlp": 1.0198493, + "epoch": 0.709793783442554, + "flos": 23621500661760.0, + "grad_norm": 1.965219374983623, + "language_loss": 0.79957139, + "learning_rate": 8.201482546387288e-07, + "loss": 0.82103878, + "num_input_tokens_seen": 126899535, + "step": 5903, + "time_per_iteration": 2.6002721786499023 + }, + { + "auxiliary_loss_clip": 0.01153218, + "auxiliary_loss_mlp": 0.0102567, + "balance_loss_clip": 1.04794693, + "balance_loss_mlp": 1.01869631, + "epoch": 0.709914026333193, + "flos": 25993472204160.0, + "grad_norm": 1.6392482300275395, + "language_loss": 0.91766375, + "learning_rate": 8.195193577191553e-07, + "loss": 0.93945265, + "num_input_tokens_seen": 126921365, + "step": 5904, + "time_per_iteration": 2.4943082332611084 + }, + { + "auxiliary_loss_clip": 0.01147404, + "auxiliary_loss_mlp": 0.00761936, + "balance_loss_clip": 1.0465194, + "balance_loss_mlp": 1.0004282, + "epoch": 0.7100342692238322, + "flos": 24861531934080.0, + "grad_norm": 2.201979540955928, + "language_loss": 0.84544945, + "learning_rate": 8.188906398775579e-07, + "loss": 0.86454284, + "num_input_tokens_seen": 126941910, + "step": 5905, + "time_per_iteration": 2.5196001529693604 + }, + { + "auxiliary_loss_clip": 0.01168288, + "auxiliary_loss_mlp": 0.00762281, + "balance_loss_clip": 1.04793715, + "balance_loss_mlp": 1.00042605, + "epoch": 0.7101545121144712, + "flos": 24932203943040.0, + "grad_norm": 1.965040495960102, + "language_loss": 0.69046426, + "learning_rate": 8.18262101209311e-07, + "loss": 0.70976996, + "num_input_tokens_seen": 126961120, + "step": 5906, + "time_per_iteration": 2.4749648571014404 + }, + { + "auxiliary_loss_clip": 0.011577, + "auxiliary_loss_mlp": 0.01022723, + "balance_loss_clip": 1.04731846, + "balance_loss_mlp": 1.0156002, + "epoch": 0.7102747550051103, + "flos": 23768842250880.0, + "grad_norm": 1.7456079028346174, + "language_loss": 0.70122093, + "learning_rate": 8.176337418097626e-07, + "loss": 0.72302514, + "num_input_tokens_seen": 126981590, + "step": 5907, + "time_per_iteration": 2.4929699897766113 + }, + { + "auxiliary_loss_clip": 0.01153564, + "auxiliary_loss_mlp": 0.00761701, + "balance_loss_clip": 1.04880261, + "balance_loss_mlp": 1.00035095, + "epoch": 0.7103949978957494, + "flos": 15303907509120.0, + "grad_norm": 2.6451944249928823, + "language_loss": 0.79995382, + "learning_rate": 8.170055617742364e-07, + "loss": 0.81910646, + "num_input_tokens_seen": 126998870, + "step": 5908, + "time_per_iteration": 2.4380545616149902 + }, + { + "auxiliary_loss_clip": 0.01133699, + "auxiliary_loss_mlp": 0.01030153, + "balance_loss_clip": 1.04359865, + "balance_loss_mlp": 1.02233922, + "epoch": 0.7105152407863885, + "flos": 22638805401600.0, + "grad_norm": 1.769385070674959, + "language_loss": 0.71064436, + "learning_rate": 8.163775611980252e-07, + "loss": 0.73228288, + "num_input_tokens_seen": 127017980, + "step": 5909, + "time_per_iteration": 3.2582204341888428 + }, + { + "auxiliary_loss_clip": 0.01140579, + "auxiliary_loss_mlp": 0.01026256, + "balance_loss_clip": 1.04775047, + "balance_loss_mlp": 1.01936603, + "epoch": 0.7106354836770276, + "flos": 17238594879360.0, + "grad_norm": 1.5974972136353895, + "language_loss": 0.78581631, + "learning_rate": 8.157497401763982e-07, + "loss": 0.80748469, + "num_input_tokens_seen": 127035645, + "step": 5910, + "time_per_iteration": 2.4502665996551514 + }, + { + "auxiliary_loss_clip": 0.01151934, + "auxiliary_loss_mlp": 0.01023923, + "balance_loss_clip": 1.04712069, + "balance_loss_mlp": 1.01661563, + "epoch": 0.7107557265676667, + "flos": 20193647898240.0, + "grad_norm": 1.8255890527228482, + "language_loss": 0.78039765, + "learning_rate": 8.151220988045935e-07, + "loss": 0.80215621, + "num_input_tokens_seen": 127054900, + "step": 5911, + "time_per_iteration": 3.2621326446533203 + }, + { + "auxiliary_loss_clip": 0.01153461, + "auxiliary_loss_mlp": 0.01023164, + "balance_loss_clip": 1.04785657, + "balance_loss_mlp": 1.01639557, + "epoch": 0.7108759694583058, + "flos": 21507080613120.0, + "grad_norm": 2.0565607215947623, + "language_loss": 0.82941008, + "learning_rate": 8.144946371778234e-07, + "loss": 0.85117626, + "num_input_tokens_seen": 127075010, + "step": 5912, + "time_per_iteration": 3.3095526695251465 + }, + { + "auxiliary_loss_clip": 0.01140143, + "auxiliary_loss_mlp": 0.00762795, + "balance_loss_clip": 1.04746783, + "balance_loss_mlp": 1.0004282, + "epoch": 0.7109962123489448, + "flos": 24061909317120.0, + "grad_norm": 1.8420633798382486, + "language_loss": 0.78162861, + "learning_rate": 8.138673553912751e-07, + "loss": 0.80065787, + "num_input_tokens_seen": 127095570, + "step": 5913, + "time_per_iteration": 2.4990622997283936 + }, + { + "auxiliary_loss_clip": 0.01111225, + "auxiliary_loss_mlp": 0.01023491, + "balance_loss_clip": 1.04306483, + "balance_loss_mlp": 1.01618683, + "epoch": 0.711116455239584, + "flos": 30480474326400.0, + "grad_norm": 10.918322215441364, + "language_loss": 0.57274139, + "learning_rate": 8.132402535401059e-07, + "loss": 0.59408861, + "num_input_tokens_seen": 127116825, + "step": 5914, + "time_per_iteration": 2.619413375854492 + }, + { + "auxiliary_loss_clip": 0.01153458, + "auxiliary_loss_mlp": 0.0102762, + "balance_loss_clip": 1.04951155, + "balance_loss_mlp": 1.0201335, + "epoch": 0.711236698130223, + "flos": 25045610158080.0, + "grad_norm": 2.109172351053258, + "language_loss": 0.74345809, + "learning_rate": 8.126133317194465e-07, + "loss": 0.76526886, + "num_input_tokens_seen": 127137015, + "step": 5915, + "time_per_iteration": 3.184783458709717 + }, + { + "auxiliary_loss_clip": 0.01108946, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.04166222, + "balance_loss_mlp": 1.01949489, + "epoch": 0.7113569410208621, + "flos": 24206701040640.0, + "grad_norm": 1.9780733247385713, + "language_loss": 0.74532568, + "learning_rate": 8.11986590024401e-07, + "loss": 0.76668894, + "num_input_tokens_seen": 127156755, + "step": 5916, + "time_per_iteration": 2.5804224014282227 + }, + { + "auxiliary_loss_clip": 0.01147855, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.05148816, + "balance_loss_mlp": 1.01954293, + "epoch": 0.7114771839115013, + "flos": 35439306526080.0, + "grad_norm": 1.876163098924071, + "language_loss": 0.69152963, + "learning_rate": 8.113600285500442e-07, + "loss": 0.71328366, + "num_input_tokens_seen": 127176965, + "step": 5917, + "time_per_iteration": 2.600675106048584 + }, + { + "auxiliary_loss_clip": 0.01168909, + "auxiliary_loss_mlp": 0.01019595, + "balance_loss_clip": 1.04831445, + "balance_loss_mlp": 1.01253796, + "epoch": 0.7115974268021403, + "flos": 21099458096640.0, + "grad_norm": 1.8739102300953667, + "language_loss": 0.74540234, + "learning_rate": 8.107336473914268e-07, + "loss": 0.76728743, + "num_input_tokens_seen": 127195595, + "step": 5918, + "time_per_iteration": 2.414710283279419 + }, + { + "auxiliary_loss_clip": 0.010498, + "auxiliary_loss_mlp": 0.01001163, + "balance_loss_clip": 1.01632953, + "balance_loss_mlp": 1.00010765, + "epoch": 0.7117176696927794, + "flos": 56752866616320.0, + "grad_norm": 0.7701010514575031, + "language_loss": 0.55774122, + "learning_rate": 8.101074466435694e-07, + "loss": 0.57825083, + "num_input_tokens_seen": 127255070, + "step": 5919, + "time_per_iteration": 3.0018229484558105 + }, + { + "auxiliary_loss_clip": 0.01147664, + "auxiliary_loss_mlp": 0.01025741, + "balance_loss_clip": 1.04545951, + "balance_loss_mlp": 1.01873732, + "epoch": 0.7118379125834186, + "flos": 15925269905280.0, + "grad_norm": 1.7143730186604327, + "language_loss": 0.6781137, + "learning_rate": 8.094814264014662e-07, + "loss": 0.69984782, + "num_input_tokens_seen": 127273825, + "step": 5920, + "time_per_iteration": 2.423405885696411 + }, + { + "auxiliary_loss_clip": 0.01170842, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.04891109, + "balance_loss_mlp": 1.02046835, + "epoch": 0.7119581554740576, + "flos": 20193360589440.0, + "grad_norm": 2.06826062705643, + "language_loss": 0.81433731, + "learning_rate": 8.088555867600844e-07, + "loss": 0.83632827, + "num_input_tokens_seen": 127289990, + "step": 5921, + "time_per_iteration": 2.425159454345703 + }, + { + "auxiliary_loss_clip": 0.01124207, + "auxiliary_loss_mlp": 0.01022925, + "balance_loss_clip": 1.0438298, + "balance_loss_mlp": 1.01627922, + "epoch": 0.7120783983646967, + "flos": 34715383822080.0, + "grad_norm": 1.645086391183544, + "language_loss": 0.60422426, + "learning_rate": 8.08229927814362e-07, + "loss": 0.62569559, + "num_input_tokens_seen": 127312880, + "step": 5922, + "time_per_iteration": 2.6355979442596436 + }, + { + "auxiliary_loss_clip": 0.01121827, + "auxiliary_loss_mlp": 0.01022627, + "balance_loss_clip": 1.04188216, + "balance_loss_mlp": 1.01566815, + "epoch": 0.7121986412553358, + "flos": 26359114700160.0, + "grad_norm": 1.6856458353800652, + "language_loss": 0.65223628, + "learning_rate": 8.076044496592134e-07, + "loss": 0.67368084, + "num_input_tokens_seen": 127334730, + "step": 5923, + "time_per_iteration": 2.558905601501465 + }, + { + "auxiliary_loss_clip": 0.01140975, + "auxiliary_loss_mlp": 0.01026422, + "balance_loss_clip": 1.04757428, + "balance_loss_mlp": 1.01953149, + "epoch": 0.7123188841459749, + "flos": 11145344371200.0, + "grad_norm": 2.9472909354827648, + "language_loss": 0.779719, + "learning_rate": 8.069791523895204e-07, + "loss": 0.80139291, + "num_input_tokens_seen": 127351180, + "step": 5924, + "time_per_iteration": 2.444303274154663 + }, + { + "auxiliary_loss_clip": 0.01113495, + "auxiliary_loss_mlp": 0.01027021, + "balance_loss_clip": 1.03968561, + "balance_loss_mlp": 1.02003574, + "epoch": 0.7124391270366139, + "flos": 20811670329600.0, + "grad_norm": 2.7339564776865966, + "language_loss": 0.77795053, + "learning_rate": 8.063540361001422e-07, + "loss": 0.79935575, + "num_input_tokens_seen": 127369750, + "step": 5925, + "time_per_iteration": 2.509763240814209 + }, + { + "auxiliary_loss_clip": 0.01120847, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.04367721, + "balance_loss_mlp": 1.0185082, + "epoch": 0.7125593699272531, + "flos": 17603734584960.0, + "grad_norm": 1.8695298956361248, + "language_loss": 0.79380929, + "learning_rate": 8.057291008859069e-07, + "loss": 0.81528217, + "num_input_tokens_seen": 127387910, + "step": 5926, + "time_per_iteration": 2.5028536319732666 + }, + { + "auxiliary_loss_clip": 0.01151277, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.04630232, + "balance_loss_mlp": 1.02089465, + "epoch": 0.7126796128178922, + "flos": 28654057526400.0, + "grad_norm": 1.892980246859141, + "language_loss": 0.68136835, + "learning_rate": 8.051043468416187e-07, + "loss": 0.70315963, + "num_input_tokens_seen": 127409160, + "step": 5927, + "time_per_iteration": 2.5853114128112793 + }, + { + "auxiliary_loss_clip": 0.01167046, + "auxiliary_loss_mlp": 0.0102243, + "balance_loss_clip": 1.04982388, + "balance_loss_mlp": 1.01541996, + "epoch": 0.7127998557085312, + "flos": 16034438315520.0, + "grad_norm": 1.739864015659535, + "language_loss": 0.82251871, + "learning_rate": 8.044797740620506e-07, + "loss": 0.84441346, + "num_input_tokens_seen": 127427765, + "step": 5928, + "time_per_iteration": 2.4110355377197266 + }, + { + "auxiliary_loss_clip": 0.01107474, + "auxiliary_loss_mlp": 0.01027235, + "balance_loss_clip": 1.04434967, + "balance_loss_mlp": 1.02057743, + "epoch": 0.7129200985991703, + "flos": 23403271582080.0, + "grad_norm": 2.3382013158819914, + "language_loss": 0.78935671, + "learning_rate": 8.038553826419494e-07, + "loss": 0.81070387, + "num_input_tokens_seen": 127446475, + "step": 5929, + "time_per_iteration": 2.5456135272979736 + }, + { + "auxiliary_loss_clip": 0.01166213, + "auxiliary_loss_mlp": 0.01021946, + "balance_loss_clip": 1.04738116, + "balance_loss_mlp": 1.01466799, + "epoch": 0.7130403414898094, + "flos": 21397445326080.0, + "grad_norm": 1.5925983082648503, + "language_loss": 0.80878288, + "learning_rate": 8.032311726760364e-07, + "loss": 0.83066452, + "num_input_tokens_seen": 127467695, + "step": 5930, + "time_per_iteration": 2.4311695098876953 + }, + { + "auxiliary_loss_clip": 0.01117748, + "auxiliary_loss_mlp": 0.01022347, + "balance_loss_clip": 1.04487777, + "balance_loss_mlp": 1.01435423, + "epoch": 0.7131605843804485, + "flos": 74739045306240.0, + "grad_norm": 1.6652256721273955, + "language_loss": 0.6909281, + "learning_rate": 8.026071442590022e-07, + "loss": 0.71232903, + "num_input_tokens_seen": 127494590, + "step": 5931, + "time_per_iteration": 2.9107630252838135 + }, + { + "auxiliary_loss_clip": 0.01155695, + "auxiliary_loss_mlp": 0.01023063, + "balance_loss_clip": 1.05101502, + "balance_loss_mlp": 1.0163188, + "epoch": 0.7132808272710875, + "flos": 18368739469440.0, + "grad_norm": 1.8645366465907327, + "language_loss": 0.80614018, + "learning_rate": 8.019832974855134e-07, + "loss": 0.82792771, + "num_input_tokens_seen": 127512550, + "step": 5932, + "time_per_iteration": 2.414710760116577 + }, + { + "auxiliary_loss_clip": 0.01124711, + "auxiliary_loss_mlp": 0.01021925, + "balance_loss_clip": 1.04586852, + "balance_loss_mlp": 1.01435828, + "epoch": 0.7134010701617267, + "flos": 23253380127360.0, + "grad_norm": 2.397313765032879, + "language_loss": 0.82587063, + "learning_rate": 8.013596324502052e-07, + "loss": 0.84733701, + "num_input_tokens_seen": 127531015, + "step": 5933, + "time_per_iteration": 2.5310628414154053 + }, + { + "auxiliary_loss_clip": 0.01146503, + "auxiliary_loss_mlp": 0.01022212, + "balance_loss_clip": 1.04686999, + "balance_loss_mlp": 1.01549208, + "epoch": 0.7135213130523658, + "flos": 23653137565440.0, + "grad_norm": 1.6789758185620631, + "language_loss": 0.7860254, + "learning_rate": 8.007361492476872e-07, + "loss": 0.80771255, + "num_input_tokens_seen": 127550340, + "step": 5934, + "time_per_iteration": 2.4680466651916504 + }, + { + "auxiliary_loss_clip": 0.01135552, + "auxiliary_loss_mlp": 0.01025064, + "balance_loss_clip": 1.04570079, + "balance_loss_mlp": 1.0172503, + "epoch": 0.7136415559430048, + "flos": 24790644443520.0, + "grad_norm": 1.626311560765611, + "language_loss": 0.7903989, + "learning_rate": 8.001128479725426e-07, + "loss": 0.81200504, + "num_input_tokens_seen": 127572245, + "step": 5935, + "time_per_iteration": 2.542800188064575 + }, + { + "auxiliary_loss_clip": 0.01102892, + "auxiliary_loss_mlp": 0.01022208, + "balance_loss_clip": 1.03877807, + "balance_loss_mlp": 1.01485837, + "epoch": 0.713761798833644, + "flos": 18296954138880.0, + "grad_norm": 1.6236523823109696, + "language_loss": 0.81287438, + "learning_rate": 7.994897287193248e-07, + "loss": 0.8341254, + "num_input_tokens_seen": 127591625, + "step": 5936, + "time_per_iteration": 3.272777557373047 + }, + { + "auxiliary_loss_clip": 0.01156622, + "auxiliary_loss_mlp": 0.01028473, + "balance_loss_clip": 1.04727864, + "balance_loss_mlp": 1.02078366, + "epoch": 0.713882041724283, + "flos": 15558262692480.0, + "grad_norm": 12.020479750260124, + "language_loss": 0.83558595, + "learning_rate": 7.988667915825605e-07, + "loss": 0.8574369, + "num_input_tokens_seen": 127608690, + "step": 5937, + "time_per_iteration": 3.234461545944214 + }, + { + "auxiliary_loss_clip": 0.01139117, + "auxiliary_loss_mlp": 0.01025049, + "balance_loss_clip": 1.04517698, + "balance_loss_mlp": 1.01760483, + "epoch": 0.7140022846149221, + "flos": 24061011477120.0, + "grad_norm": 2.190641409999294, + "language_loss": 0.75256544, + "learning_rate": 7.982440366567491e-07, + "loss": 0.77420712, + "num_input_tokens_seen": 127627180, + "step": 5938, + "time_per_iteration": 2.487715482711792 + }, + { + "auxiliary_loss_clip": 0.01148037, + "auxiliary_loss_mlp": 0.01023534, + "balance_loss_clip": 1.04544067, + "balance_loss_mlp": 1.01628566, + "epoch": 0.7141225275055613, + "flos": 27891710248320.0, + "grad_norm": 1.798389852428495, + "language_loss": 0.75367653, + "learning_rate": 7.97621464036361e-07, + "loss": 0.77539229, + "num_input_tokens_seen": 127648940, + "step": 5939, + "time_per_iteration": 3.3337647914886475 + }, + { + "auxiliary_loss_clip": 0.01156462, + "auxiliary_loss_mlp": 0.01023416, + "balance_loss_clip": 1.04747534, + "balance_loss_mlp": 1.01582873, + "epoch": 0.7142427703962003, + "flos": 19682603147520.0, + "grad_norm": 1.612917090763218, + "language_loss": 0.68116152, + "learning_rate": 7.969990738158417e-07, + "loss": 0.70296025, + "num_input_tokens_seen": 127667350, + "step": 5940, + "time_per_iteration": 2.427645206451416 + }, + { + "auxiliary_loss_clip": 0.0115693, + "auxiliary_loss_mlp": 0.01025578, + "balance_loss_clip": 1.04932261, + "balance_loss_mlp": 1.01804423, + "epoch": 0.7143630132868394, + "flos": 21032377447680.0, + "grad_norm": 2.197820580097943, + "language_loss": 0.85094392, + "learning_rate": 7.963768660896062e-07, + "loss": 0.87276906, + "num_input_tokens_seen": 127685760, + "step": 5941, + "time_per_iteration": 3.181696653366089 + }, + { + "auxiliary_loss_clip": 0.01156318, + "auxiliary_loss_mlp": 0.01025819, + "balance_loss_clip": 1.04737258, + "balance_loss_mlp": 1.01796341, + "epoch": 0.7144832561774785, + "flos": 24129923719680.0, + "grad_norm": 2.1006198240667793, + "language_loss": 0.82558203, + "learning_rate": 7.957548409520432e-07, + "loss": 0.84740341, + "num_input_tokens_seen": 127704985, + "step": 5942, + "time_per_iteration": 2.446225643157959 + }, + { + "auxiliary_loss_clip": 0.01125794, + "auxiliary_loss_mlp": 0.01020061, + "balance_loss_clip": 1.04293668, + "balance_loss_mlp": 1.01313806, + "epoch": 0.7146034990681176, + "flos": 16325817442560.0, + "grad_norm": 3.5090619472832847, + "language_loss": 0.84121609, + "learning_rate": 7.951329984975135e-07, + "loss": 0.86267465, + "num_input_tokens_seen": 127721925, + "step": 5943, + "time_per_iteration": 2.471788167953491 + }, + { + "auxiliary_loss_clip": 0.01041298, + "auxiliary_loss_mlp": 0.01001847, + "balance_loss_clip": 1.01241088, + "balance_loss_mlp": 1.00081563, + "epoch": 0.7147237419587567, + "flos": 69627164232960.0, + "grad_norm": 0.7138321493804686, + "language_loss": 0.54321826, + "learning_rate": 7.94511338820349e-07, + "loss": 0.56364971, + "num_input_tokens_seen": 127784230, + "step": 5944, + "time_per_iteration": 3.0834860801696777 + }, + { + "auxiliary_loss_clip": 0.01140484, + "auxiliary_loss_mlp": 0.00762358, + "balance_loss_clip": 1.04580259, + "balance_loss_mlp": 1.0003413, + "epoch": 0.7148439848493958, + "flos": 22266806198400.0, + "grad_norm": 2.297531839271508, + "language_loss": 0.78409398, + "learning_rate": 7.938898620148575e-07, + "loss": 0.80312246, + "num_input_tokens_seen": 127801990, + "step": 5945, + "time_per_iteration": 2.484203815460205 + }, + { + "auxiliary_loss_clip": 0.01139539, + "auxiliary_loss_mlp": 0.0102522, + "balance_loss_clip": 1.0464921, + "balance_loss_mlp": 1.01804686, + "epoch": 0.7149642277400349, + "flos": 17931383470080.0, + "grad_norm": 1.9867961615757628, + "language_loss": 0.70704776, + "learning_rate": 7.932685681753135e-07, + "loss": 0.72869533, + "num_input_tokens_seen": 127819270, + "step": 5946, + "time_per_iteration": 2.4448025226593018 + }, + { + "auxiliary_loss_clip": 0.01164645, + "auxiliary_loss_mlp": 0.01021362, + "balance_loss_clip": 1.04844403, + "balance_loss_mlp": 1.01468062, + "epoch": 0.7150844706306739, + "flos": 31681937370240.0, + "grad_norm": 2.025471147651801, + "language_loss": 0.62650669, + "learning_rate": 7.92647457395969e-07, + "loss": 0.64836669, + "num_input_tokens_seen": 127841095, + "step": 5947, + "time_per_iteration": 2.5029704570770264 + }, + { + "auxiliary_loss_clip": 0.01104954, + "auxiliary_loss_mlp": 0.01026908, + "balance_loss_clip": 1.03982544, + "balance_loss_mlp": 1.01941872, + "epoch": 0.7152047135213131, + "flos": 10926217451520.0, + "grad_norm": 2.033030334656612, + "language_loss": 0.74778056, + "learning_rate": 7.920265297710444e-07, + "loss": 0.76909918, + "num_input_tokens_seen": 127858485, + "step": 5948, + "time_per_iteration": 2.55680775642395 + }, + { + "auxiliary_loss_clip": 0.01155035, + "auxiliary_loss_mlp": 0.01028357, + "balance_loss_clip": 1.04873919, + "balance_loss_mlp": 1.02104664, + "epoch": 0.7153249564119522, + "flos": 20995640812800.0, + "grad_norm": 1.8020935211084854, + "language_loss": 0.73254424, + "learning_rate": 7.914057853947363e-07, + "loss": 0.75437814, + "num_input_tokens_seen": 127877665, + "step": 5949, + "time_per_iteration": 2.4449634552001953 + }, + { + "auxiliary_loss_clip": 0.01126886, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.04588938, + "balance_loss_mlp": 1.02309573, + "epoch": 0.7154451993025912, + "flos": 24243114453120.0, + "grad_norm": 2.3097265687939466, + "language_loss": 0.62464869, + "learning_rate": 7.907852243612089e-07, + "loss": 0.6462245, + "num_input_tokens_seen": 127898070, + "step": 5950, + "time_per_iteration": 2.5632383823394775 + }, + { + "auxiliary_loss_clip": 0.01137629, + "auxiliary_loss_mlp": 0.01024004, + "balance_loss_clip": 1.04442382, + "balance_loss_mlp": 1.01710463, + "epoch": 0.7155654421932304, + "flos": 23330947547520.0, + "grad_norm": 2.4445920557105323, + "language_loss": 0.72456896, + "learning_rate": 7.901648467646009e-07, + "loss": 0.7461853, + "num_input_tokens_seen": 127917010, + "step": 5951, + "time_per_iteration": 2.4919145107269287 + }, + { + "auxiliary_loss_clip": 0.01170737, + "auxiliary_loss_mlp": 0.01028093, + "balance_loss_clip": 1.05073297, + "balance_loss_mlp": 1.02089024, + "epoch": 0.7156856850838694, + "flos": 22711883621760.0, + "grad_norm": 1.513611795855191, + "language_loss": 0.72323102, + "learning_rate": 7.895446526990244e-07, + "loss": 0.74521929, + "num_input_tokens_seen": 127937025, + "step": 5952, + "time_per_iteration": 2.431321859359741 + }, + { + "auxiliary_loss_clip": 0.0112192, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.04544544, + "balance_loss_mlp": 1.01892948, + "epoch": 0.7158059279745085, + "flos": 19865424395520.0, + "grad_norm": 1.4918191436637456, + "language_loss": 0.75417376, + "learning_rate": 7.889246422585609e-07, + "loss": 0.77565682, + "num_input_tokens_seen": 127956410, + "step": 5953, + "time_per_iteration": 2.5446574687957764 + }, + { + "auxiliary_loss_clip": 0.01169615, + "auxiliary_loss_mlp": 0.01022095, + "balance_loss_clip": 1.05043077, + "balance_loss_mlp": 1.01537204, + "epoch": 0.7159261708651476, + "flos": 24134772055680.0, + "grad_norm": 2.0535115772754033, + "language_loss": 0.73640311, + "learning_rate": 7.883048155372675e-07, + "loss": 0.75832021, + "num_input_tokens_seen": 127974925, + "step": 5954, + "time_per_iteration": 2.438392400741577 + }, + { + "auxiliary_loss_clip": 0.0114473, + "auxiliary_loss_mlp": 0.01023021, + "balance_loss_clip": 1.04652762, + "balance_loss_mlp": 1.01592767, + "epoch": 0.7160464137557867, + "flos": 16983198201600.0, + "grad_norm": 2.2442108113923274, + "language_loss": 0.71274406, + "learning_rate": 7.876851726291698e-07, + "loss": 0.73442155, + "num_input_tokens_seen": 127993225, + "step": 5955, + "time_per_iteration": 2.462009906768799 + }, + { + "auxiliary_loss_clip": 0.01129355, + "auxiliary_loss_mlp": 0.01023029, + "balance_loss_clip": 1.04402018, + "balance_loss_mlp": 1.01588583, + "epoch": 0.7161666566464258, + "flos": 25228251838080.0, + "grad_norm": 1.9405708162473427, + "language_loss": 0.78466296, + "learning_rate": 7.870657136282666e-07, + "loss": 0.8061868, + "num_input_tokens_seen": 128012085, + "step": 5956, + "time_per_iteration": 2.586334705352783 + }, + { + "auxiliary_loss_clip": 0.0114964, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.04631352, + "balance_loss_mlp": 1.01942682, + "epoch": 0.7162868995370649, + "flos": 26468390851200.0, + "grad_norm": 1.5535642698682754, + "language_loss": 0.81967443, + "learning_rate": 7.86446438628531e-07, + "loss": 0.84143806, + "num_input_tokens_seen": 128033155, + "step": 5957, + "time_per_iteration": 2.480459213256836 + }, + { + "auxiliary_loss_clip": 0.010693, + "auxiliary_loss_mlp": 0.01001776, + "balance_loss_clip": 1.01337409, + "balance_loss_mlp": 1.00078666, + "epoch": 0.716407142427704, + "flos": 69998912040960.0, + "grad_norm": 0.7706519337822517, + "language_loss": 0.56893235, + "learning_rate": 7.858273477239059e-07, + "loss": 0.58964312, + "num_input_tokens_seen": 128101575, + "step": 5958, + "time_per_iteration": 3.028585433959961 + }, + { + "auxiliary_loss_clip": 0.01097445, + "auxiliary_loss_mlp": 0.01025699, + "balance_loss_clip": 1.04128027, + "balance_loss_mlp": 1.01806951, + "epoch": 0.716527385318343, + "flos": 20740459616640.0, + "grad_norm": 3.981616231933726, + "language_loss": 0.71342969, + "learning_rate": 7.852084410083067e-07, + "loss": 0.73466116, + "num_input_tokens_seen": 128120395, + "step": 5959, + "time_per_iteration": 2.529221773147583 + }, + { + "auxiliary_loss_clip": 0.01135568, + "auxiliary_loss_mlp": 0.01023634, + "balance_loss_clip": 1.04644775, + "balance_loss_mlp": 1.01680613, + "epoch": 0.7166476282089821, + "flos": 25371966153600.0, + "grad_norm": 1.5823201993328826, + "language_loss": 0.63586152, + "learning_rate": 7.84589718575621e-07, + "loss": 0.6574536, + "num_input_tokens_seen": 128140840, + "step": 5960, + "time_per_iteration": 2.499711513519287 + }, + { + "auxiliary_loss_clip": 0.01139584, + "auxiliary_loss_mlp": 0.01024692, + "balance_loss_clip": 1.04140449, + "balance_loss_mlp": 1.01726508, + "epoch": 0.7167678710996213, + "flos": 24133730561280.0, + "grad_norm": 2.051914502092473, + "language_loss": 0.69125974, + "learning_rate": 7.83971180519708e-07, + "loss": 0.71290255, + "num_input_tokens_seen": 128159695, + "step": 5961, + "time_per_iteration": 2.485414743423462 + }, + { + "auxiliary_loss_clip": 0.01171054, + "auxiliary_loss_mlp": 0.01020219, + "balance_loss_clip": 1.05058181, + "balance_loss_mlp": 1.01269412, + "epoch": 0.7168881139902603, + "flos": 30226586019840.0, + "grad_norm": 1.9783952506312736, + "language_loss": 0.75783879, + "learning_rate": 7.833528269344008e-07, + "loss": 0.77975154, + "num_input_tokens_seen": 128179600, + "step": 5962, + "time_per_iteration": 2.4719698429107666 + }, + { + "auxiliary_loss_clip": 0.01127714, + "auxiliary_loss_mlp": 0.01027687, + "balance_loss_clip": 1.0478543, + "balance_loss_mlp": 1.02006364, + "epoch": 0.7170083568808994, + "flos": 14606414236800.0, + "grad_norm": 2.19147052371818, + "language_loss": 0.77354544, + "learning_rate": 7.827346579135023e-07, + "loss": 0.7950995, + "num_input_tokens_seen": 128196940, + "step": 5963, + "time_per_iteration": 3.235485553741455 + }, + { + "auxiliary_loss_clip": 0.01132614, + "auxiliary_loss_mlp": 0.01023625, + "balance_loss_clip": 1.04109097, + "balance_loss_mlp": 1.01607275, + "epoch": 0.7171285997715385, + "flos": 23331091201920.0, + "grad_norm": 1.918318510181046, + "language_loss": 0.83056885, + "learning_rate": 7.821166735507885e-07, + "loss": 0.85213125, + "num_input_tokens_seen": 128215970, + "step": 5964, + "time_per_iteration": 3.3208627700805664 + }, + { + "auxiliary_loss_clip": 0.01166054, + "auxiliary_loss_mlp": 0.0102293, + "balance_loss_clip": 1.04807389, + "balance_loss_mlp": 1.01586342, + "epoch": 0.7172488426621776, + "flos": 16543543731840.0, + "grad_norm": 1.7061114409008664, + "language_loss": 0.68515599, + "learning_rate": 7.81498873940007e-07, + "loss": 0.70704591, + "num_input_tokens_seen": 128233185, + "step": 5965, + "time_per_iteration": 3.24603533744812 + }, + { + "auxiliary_loss_clip": 0.01157576, + "auxiliary_loss_mlp": 0.01020295, + "balance_loss_clip": 1.04477262, + "balance_loss_mlp": 1.01268053, + "epoch": 0.7173690855528166, + "flos": 26541612725760.0, + "grad_norm": 2.198894462808227, + "language_loss": 0.77404654, + "learning_rate": 7.808812591748768e-07, + "loss": 0.79582524, + "num_input_tokens_seen": 128253565, + "step": 5966, + "time_per_iteration": 2.4849202632904053 + }, + { + "auxiliary_loss_clip": 0.01120585, + "auxiliary_loss_mlp": 0.01022783, + "balance_loss_clip": 1.04278624, + "balance_loss_mlp": 1.01538587, + "epoch": 0.7174893284434558, + "flos": 22784099915520.0, + "grad_norm": 1.9995247814891484, + "language_loss": 0.65141243, + "learning_rate": 7.802638293490915e-07, + "loss": 0.67284608, + "num_input_tokens_seen": 128273210, + "step": 5967, + "time_per_iteration": 2.511413812637329 + }, + { + "auxiliary_loss_clip": 0.01141701, + "auxiliary_loss_mlp": 0.01024165, + "balance_loss_clip": 1.04485655, + "balance_loss_mlp": 1.01720881, + "epoch": 0.7176095713340949, + "flos": 23293564467840.0, + "grad_norm": 1.607539206124552, + "language_loss": 0.76828629, + "learning_rate": 7.796465845563123e-07, + "loss": 0.78994495, + "num_input_tokens_seen": 128292085, + "step": 5968, + "time_per_iteration": 3.2865138053894043 + }, + { + "auxiliary_loss_clip": 0.01133691, + "auxiliary_loss_mlp": 0.00762008, + "balance_loss_clip": 1.04482985, + "balance_loss_mlp": 1.00039935, + "epoch": 0.7177298142247339, + "flos": 25591631777280.0, + "grad_norm": 1.9185757109855452, + "language_loss": 0.79618633, + "learning_rate": 7.790295248901766e-07, + "loss": 0.81514329, + "num_input_tokens_seen": 128313215, + "step": 5969, + "time_per_iteration": 2.513559103012085 + }, + { + "auxiliary_loss_clip": 0.01154186, + "auxiliary_loss_mlp": 0.01023751, + "balance_loss_clip": 1.04812098, + "balance_loss_mlp": 1.01649463, + "epoch": 0.7178500571153731, + "flos": 31652778504960.0, + "grad_norm": 3.232229592781444, + "language_loss": 0.62354678, + "learning_rate": 7.784126504442902e-07, + "loss": 0.64532614, + "num_input_tokens_seen": 128336445, + "step": 5970, + "time_per_iteration": 2.550555944442749 + }, + { + "auxiliary_loss_clip": 0.01116124, + "auxiliary_loss_mlp": 0.01019245, + "balance_loss_clip": 1.04328299, + "balance_loss_mlp": 1.01193142, + "epoch": 0.7179703000060121, + "flos": 19427242383360.0, + "grad_norm": 1.431572761934135, + "language_loss": 0.67669272, + "learning_rate": 7.777959613122351e-07, + "loss": 0.69804645, + "num_input_tokens_seen": 128356270, + "step": 5971, + "time_per_iteration": 2.496440887451172 + }, + { + "auxiliary_loss_clip": 0.01131966, + "auxiliary_loss_mlp": 0.01025844, + "balance_loss_clip": 1.04554319, + "balance_loss_mlp": 1.01872134, + "epoch": 0.7180905428966512, + "flos": 28839249072000.0, + "grad_norm": 1.8246313015915578, + "language_loss": 0.77922994, + "learning_rate": 7.771794575875604e-07, + "loss": 0.80080807, + "num_input_tokens_seen": 128378140, + "step": 5972, + "time_per_iteration": 2.5360546112060547 + }, + { + "auxiliary_loss_clip": 0.01156815, + "auxiliary_loss_mlp": 0.01029284, + "balance_loss_clip": 1.05040479, + "balance_loss_mlp": 1.02143455, + "epoch": 0.7182107857872904, + "flos": 20047563285120.0, + "grad_norm": 2.3693136504902492, + "language_loss": 0.77600729, + "learning_rate": 7.765631393637888e-07, + "loss": 0.79786831, + "num_input_tokens_seen": 128396335, + "step": 5973, + "time_per_iteration": 2.480060577392578 + }, + { + "auxiliary_loss_clip": 0.01148786, + "auxiliary_loss_mlp": 0.01022247, + "balance_loss_clip": 1.04402542, + "balance_loss_mlp": 1.01473045, + "epoch": 0.7183310286779294, + "flos": 22747686503040.0, + "grad_norm": 2.7891524672140107, + "language_loss": 0.48781639, + "learning_rate": 7.75947006734417e-07, + "loss": 0.50952673, + "num_input_tokens_seen": 128414115, + "step": 5974, + "time_per_iteration": 2.4523487091064453 + }, + { + "auxiliary_loss_clip": 0.01166098, + "auxiliary_loss_mlp": 0.01025511, + "balance_loss_clip": 1.04692411, + "balance_loss_mlp": 1.01825094, + "epoch": 0.7184512715685685, + "flos": 17158262112000.0, + "grad_norm": 1.9762087871114307, + "language_loss": 0.82911795, + "learning_rate": 7.753310597929101e-07, + "loss": 0.85103405, + "num_input_tokens_seen": 128430755, + "step": 5975, + "time_per_iteration": 2.363180160522461 + }, + { + "auxiliary_loss_clip": 0.01069045, + "auxiliary_loss_mlp": 0.01001057, + "balance_loss_clip": 1.01330996, + "balance_loss_mlp": 1.00007999, + "epoch": 0.7185715144592076, + "flos": 65509611448320.0, + "grad_norm": 0.7588439268693543, + "language_loss": 0.55146426, + "learning_rate": 7.747152986327095e-07, + "loss": 0.57216531, + "num_input_tokens_seen": 128491300, + "step": 5976, + "time_per_iteration": 2.930744171142578 + }, + { + "auxiliary_loss_clip": 0.01115246, + "auxiliary_loss_mlp": 0.01025427, + "balance_loss_clip": 1.04405522, + "balance_loss_mlp": 1.01831925, + "epoch": 0.7186917573498467, + "flos": 16180522928640.0, + "grad_norm": 1.7968503572306591, + "language_loss": 0.68002009, + "learning_rate": 7.740997233472228e-07, + "loss": 0.70142674, + "num_input_tokens_seen": 128508920, + "step": 5977, + "time_per_iteration": 2.4975147247314453 + }, + { + "auxiliary_loss_clip": 0.0113971, + "auxiliary_loss_mlp": 0.01019987, + "balance_loss_clip": 1.04523945, + "balance_loss_mlp": 1.01359463, + "epoch": 0.7188120002404857, + "flos": 29242274647680.0, + "grad_norm": 2.028477166157526, + "language_loss": 0.70545155, + "learning_rate": 7.734843340298329e-07, + "loss": 0.72704852, + "num_input_tokens_seen": 128528745, + "step": 5978, + "time_per_iteration": 2.556546211242676 + }, + { + "auxiliary_loss_clip": 0.01143963, + "auxiliary_loss_mlp": 0.01029059, + "balance_loss_clip": 1.04546499, + "balance_loss_mlp": 1.02142322, + "epoch": 0.7189322431311249, + "flos": 33401161008000.0, + "grad_norm": 1.8227730176582633, + "language_loss": 0.7514649, + "learning_rate": 7.72869130773895e-07, + "loss": 0.77319515, + "num_input_tokens_seen": 128549345, + "step": 5979, + "time_per_iteration": 2.580125093460083 + }, + { + "auxiliary_loss_clip": 0.0106055, + "auxiliary_loss_mlp": 0.01001273, + "balance_loss_clip": 1.01362038, + "balance_loss_mlp": 1.00025368, + "epoch": 0.719052486021764, + "flos": 61351263792000.0, + "grad_norm": 0.7841740392847185, + "language_loss": 0.59397566, + "learning_rate": 7.722541136727343e-07, + "loss": 0.61459386, + "num_input_tokens_seen": 128605360, + "step": 5980, + "time_per_iteration": 2.910393238067627 + }, + { + "auxiliary_loss_clip": 0.01153463, + "auxiliary_loss_mlp": 0.0102267, + "balance_loss_clip": 1.0480113, + "balance_loss_mlp": 1.015306, + "epoch": 0.719172728912403, + "flos": 15596795007360.0, + "grad_norm": 4.4258872686997925, + "language_loss": 0.80874044, + "learning_rate": 7.716392828196483e-07, + "loss": 0.83050179, + "num_input_tokens_seen": 128623160, + "step": 5981, + "time_per_iteration": 2.4103119373321533 + }, + { + "auxiliary_loss_clip": 0.01154508, + "auxiliary_loss_mlp": 0.01026728, + "balance_loss_clip": 1.04875374, + "balance_loss_mlp": 1.01968312, + "epoch": 0.7192929718030422, + "flos": 15553162961280.0, + "grad_norm": 2.3907294685851266, + "language_loss": 0.76855034, + "learning_rate": 7.710246383079064e-07, + "loss": 0.79036266, + "num_input_tokens_seen": 128638545, + "step": 5982, + "time_per_iteration": 2.41166090965271 + }, + { + "auxiliary_loss_clip": 0.01141437, + "auxiliary_loss_mlp": 0.01024494, + "balance_loss_clip": 1.04382706, + "balance_loss_mlp": 1.01710343, + "epoch": 0.7194132146936812, + "flos": 21862487733120.0, + "grad_norm": 2.7910990426610023, + "language_loss": 0.92155468, + "learning_rate": 7.704101802307492e-07, + "loss": 0.94321406, + "num_input_tokens_seen": 128650845, + "step": 5983, + "time_per_iteration": 2.452523708343506 + }, + { + "auxiliary_loss_clip": 0.01117713, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.04321396, + "balance_loss_mlp": 1.02182698, + "epoch": 0.7195334575843203, + "flos": 27338900958720.0, + "grad_norm": 2.079848905110411, + "language_loss": 0.86884528, + "learning_rate": 7.697959086813912e-07, + "loss": 0.89032108, + "num_input_tokens_seen": 128667010, + "step": 5984, + "time_per_iteration": 2.530970335006714 + }, + { + "auxiliary_loss_clip": 0.01117591, + "auxiliary_loss_mlp": 0.01025737, + "balance_loss_clip": 1.04199338, + "balance_loss_mlp": 1.01849818, + "epoch": 0.7196537004749595, + "flos": 18770615809920.0, + "grad_norm": 1.7202020209522562, + "language_loss": 0.79658598, + "learning_rate": 7.691818237530145e-07, + "loss": 0.81801927, + "num_input_tokens_seen": 128685870, + "step": 5985, + "time_per_iteration": 2.4783823490142822 + }, + { + "auxiliary_loss_clip": 0.01124029, + "auxiliary_loss_mlp": 0.01024653, + "balance_loss_clip": 1.0443089, + "balance_loss_mlp": 1.0174644, + "epoch": 0.7197739433655985, + "flos": 24531009960960.0, + "grad_norm": 3.347365544878862, + "language_loss": 0.77560627, + "learning_rate": 7.685679255387774e-07, + "loss": 0.79709309, + "num_input_tokens_seen": 128704185, + "step": 5986, + "time_per_iteration": 2.5717742443084717 + }, + { + "auxiliary_loss_clip": 0.01137741, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.04616594, + "balance_loss_mlp": 1.01807249, + "epoch": 0.7198941862562376, + "flos": 18040587793920.0, + "grad_norm": 1.9515849246077535, + "language_loss": 0.7704761, + "learning_rate": 7.679542141318065e-07, + "loss": 0.79210746, + "num_input_tokens_seen": 128721290, + "step": 5987, + "time_per_iteration": 2.460951566696167 + }, + { + "auxiliary_loss_clip": 0.0112759, + "auxiliary_loss_mlp": 0.0102375, + "balance_loss_clip": 1.04280448, + "balance_loss_mlp": 1.01686835, + "epoch": 0.7200144291468767, + "flos": 29022393542400.0, + "grad_norm": 2.985615211994362, + "language_loss": 0.75820959, + "learning_rate": 7.673406896252013e-07, + "loss": 0.77972293, + "num_input_tokens_seen": 128742665, + "step": 5988, + "time_per_iteration": 2.5501222610473633 + }, + { + "auxiliary_loss_clip": 0.01123739, + "auxiliary_loss_mlp": 0.01028472, + "balance_loss_clip": 1.04211879, + "balance_loss_mlp": 1.02016938, + "epoch": 0.7201346720375158, + "flos": 25374264624000.0, + "grad_norm": 1.7704955440651207, + "language_loss": 0.7838248, + "learning_rate": 7.667273521120347e-07, + "loss": 0.80534697, + "num_input_tokens_seen": 128762225, + "step": 5989, + "time_per_iteration": 3.335773229598999 + }, + { + "auxiliary_loss_clip": 0.01130774, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.04635978, + "balance_loss_mlp": 1.02109003, + "epoch": 0.7202549149281549, + "flos": 14355614499840.0, + "grad_norm": 1.9877454207093774, + "language_loss": 0.79988301, + "learning_rate": 7.661142016853468e-07, + "loss": 0.82147205, + "num_input_tokens_seen": 128779585, + "step": 5990, + "time_per_iteration": 3.3239893913269043 + }, + { + "auxiliary_loss_clip": 0.01110229, + "auxiliary_loss_mlp": 0.01025868, + "balance_loss_clip": 1.042992, + "balance_loss_mlp": 1.01868272, + "epoch": 0.7203751578187939, + "flos": 23001682550400.0, + "grad_norm": 1.7743199523224717, + "language_loss": 0.74498689, + "learning_rate": 7.655012384381543e-07, + "loss": 0.76634789, + "num_input_tokens_seen": 128799070, + "step": 5991, + "time_per_iteration": 2.561940908432007 + }, + { + "auxiliary_loss_clip": 0.01139675, + "auxiliary_loss_mlp": 0.01028656, + "balance_loss_clip": 1.05005169, + "balance_loss_mlp": 1.02133083, + "epoch": 0.7204954007094331, + "flos": 23692424065920.0, + "grad_norm": 1.6525945971799834, + "language_loss": 0.81969774, + "learning_rate": 7.648884624634415e-07, + "loss": 0.84138101, + "num_input_tokens_seen": 128817620, + "step": 5992, + "time_per_iteration": 3.3387362957000732 + }, + { + "auxiliary_loss_clip": 0.01148888, + "auxiliary_loss_mlp": 0.01026357, + "balance_loss_clip": 1.04598236, + "balance_loss_mlp": 1.01917481, + "epoch": 0.7206156436000721, + "flos": 16253026531200.0, + "grad_norm": 1.7731871169080444, + "language_loss": 0.88784838, + "learning_rate": 7.642758738541683e-07, + "loss": 0.90960085, + "num_input_tokens_seen": 128834200, + "step": 5993, + "time_per_iteration": 2.4055371284484863 + }, + { + "auxiliary_loss_clip": 0.01058743, + "auxiliary_loss_mlp": 0.01001506, + "balance_loss_clip": 1.01376402, + "balance_loss_mlp": 1.00054014, + "epoch": 0.7207358864907112, + "flos": 54377806504320.0, + "grad_norm": 0.7572707582816807, + "language_loss": 0.60749763, + "learning_rate": 7.636634727032621e-07, + "loss": 0.62810016, + "num_input_tokens_seen": 128891305, + "step": 5994, + "time_per_iteration": 2.8905177116394043 + }, + { + "auxiliary_loss_clip": 0.01127889, + "auxiliary_loss_mlp": 0.0102503, + "balance_loss_clip": 1.04140258, + "balance_loss_mlp": 1.01720393, + "epoch": 0.7208561293813504, + "flos": 19135540033920.0, + "grad_norm": 1.8721676484559238, + "language_loss": 0.78841472, + "learning_rate": 7.630512591036231e-07, + "loss": 0.80994391, + "num_input_tokens_seen": 128910615, + "step": 5995, + "time_per_iteration": 3.284900188446045 + }, + { + "auxiliary_loss_clip": 0.01157233, + "auxiliary_loss_mlp": 0.01029274, + "balance_loss_clip": 1.04973745, + "balance_loss_mlp": 1.0220201, + "epoch": 0.7209763722719894, + "flos": 17748526308480.0, + "grad_norm": 2.3773216578585736, + "language_loss": 0.64676392, + "learning_rate": 7.624392331481255e-07, + "loss": 0.66862893, + "num_input_tokens_seen": 128928270, + "step": 5996, + "time_per_iteration": 2.4248831272125244 + }, + { + "auxiliary_loss_clip": 0.0105985, + "auxiliary_loss_mlp": 0.01001382, + "balance_loss_clip": 1.01456189, + "balance_loss_mlp": 1.00031495, + "epoch": 0.7210966151626285, + "flos": 66819488716800.0, + "grad_norm": 0.7493092913616837, + "language_loss": 0.51836169, + "learning_rate": 7.618273949296115e-07, + "loss": 0.53897393, + "num_input_tokens_seen": 128987780, + "step": 5997, + "time_per_iteration": 2.9322750568389893 + }, + { + "auxiliary_loss_clip": 0.01133361, + "auxiliary_loss_mlp": 0.0102533, + "balance_loss_clip": 1.04344153, + "balance_loss_mlp": 1.01722407, + "epoch": 0.7212168580532676, + "flos": 21141869080320.0, + "grad_norm": 1.8981321493140701, + "language_loss": 0.68664795, + "learning_rate": 7.612157445408987e-07, + "loss": 0.70823485, + "num_input_tokens_seen": 129005590, + "step": 5998, + "time_per_iteration": 2.4826321601867676 + }, + { + "auxiliary_loss_clip": 0.01148237, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.05186796, + "balance_loss_mlp": 1.02081037, + "epoch": 0.7213371009439067, + "flos": 22345738335360.0, + "grad_norm": 2.226406481358763, + "language_loss": 0.74367869, + "learning_rate": 7.606042820747716e-07, + "loss": 0.76544499, + "num_input_tokens_seen": 129021995, + "step": 5999, + "time_per_iteration": 2.4945120811462402 + }, + { + "auxiliary_loss_clip": 0.01146962, + "auxiliary_loss_mlp": 0.01024081, + "balance_loss_clip": 1.05046725, + "balance_loss_mlp": 1.01663053, + "epoch": 0.7214573438345457, + "flos": 18515901490560.0, + "grad_norm": 2.1359100509397706, + "language_loss": 0.85404289, + "learning_rate": 7.599930076239889e-07, + "loss": 0.87575334, + "num_input_tokens_seen": 129039280, + "step": 6000, + "time_per_iteration": 2.472728967666626 + }, + { + "auxiliary_loss_clip": 0.01115423, + "auxiliary_loss_mlp": 0.00761979, + "balance_loss_clip": 1.04511642, + "balance_loss_mlp": 1.00037766, + "epoch": 0.7215775867251849, + "flos": 35736108606720.0, + "grad_norm": 2.006742038219633, + "language_loss": 0.70400584, + "learning_rate": 7.593819212812818e-07, + "loss": 0.72277993, + "num_input_tokens_seen": 129060860, + "step": 6001, + "time_per_iteration": 2.66943621635437 + }, + { + "auxiliary_loss_clip": 0.01153777, + "auxiliary_loss_mlp": 0.01022394, + "balance_loss_clip": 1.04854417, + "balance_loss_mlp": 1.01536119, + "epoch": 0.721697829615824, + "flos": 20372410909440.0, + "grad_norm": 5.027456021302868, + "language_loss": 0.71434486, + "learning_rate": 7.587710231393508e-07, + "loss": 0.73610651, + "num_input_tokens_seen": 129079215, + "step": 6002, + "time_per_iteration": 2.4332945346832275 + }, + { + "auxiliary_loss_clip": 0.01071424, + "auxiliary_loss_mlp": 0.01021402, + "balance_loss_clip": 1.03706491, + "balance_loss_mlp": 1.01458025, + "epoch": 0.721818072506463, + "flos": 20229809915520.0, + "grad_norm": 1.909431944797739, + "language_loss": 0.8379997, + "learning_rate": 7.581603132908685e-07, + "loss": 0.85892797, + "num_input_tokens_seen": 129097185, + "step": 6003, + "time_per_iteration": 2.5940792560577393 + }, + { + "auxiliary_loss_clip": 0.01121517, + "auxiliary_loss_mlp": 0.01022629, + "balance_loss_clip": 1.04460168, + "balance_loss_mlp": 1.01512146, + "epoch": 0.7219383153971022, + "flos": 18186887888640.0, + "grad_norm": 1.9426869239432827, + "language_loss": 0.78648496, + "learning_rate": 7.575497918284795e-07, + "loss": 0.80792642, + "num_input_tokens_seen": 129114730, + "step": 6004, + "time_per_iteration": 2.501044511795044 + }, + { + "auxiliary_loss_clip": 0.01171574, + "auxiliary_loss_mlp": 0.01029238, + "balance_loss_clip": 1.04900825, + "balance_loss_mlp": 1.02178466, + "epoch": 0.7220585582877412, + "flos": 17342124854400.0, + "grad_norm": 1.9389326334801453, + "language_loss": 0.74713039, + "learning_rate": 7.569394588447984e-07, + "loss": 0.76913851, + "num_input_tokens_seen": 129131745, + "step": 6005, + "time_per_iteration": 2.451655387878418 + }, + { + "auxiliary_loss_clip": 0.01145827, + "auxiliary_loss_mlp": 0.01025989, + "balance_loss_clip": 1.04499614, + "balance_loss_mlp": 1.01891947, + "epoch": 0.7221788011783803, + "flos": 16976338704000.0, + "grad_norm": 2.1875925920998402, + "language_loss": 0.77720118, + "learning_rate": 7.563293144324146e-07, + "loss": 0.79891932, + "num_input_tokens_seen": 129147295, + "step": 6006, + "time_per_iteration": 2.4157588481903076 + }, + { + "auxiliary_loss_clip": 0.01168018, + "auxiliary_loss_mlp": 0.0102617, + "balance_loss_clip": 1.05020463, + "balance_loss_mlp": 1.01928282, + "epoch": 0.7222990440690195, + "flos": 26286359702400.0, + "grad_norm": 1.7745956050819711, + "language_loss": 0.80347037, + "learning_rate": 7.557193586838834e-07, + "loss": 0.82541227, + "num_input_tokens_seen": 129162660, + "step": 6007, + "time_per_iteration": 2.452289581298828 + }, + { + "auxiliary_loss_clip": 0.01144258, + "auxiliary_loss_mlp": 0.01024422, + "balance_loss_clip": 1.04560399, + "balance_loss_mlp": 1.0174005, + "epoch": 0.7224192869596585, + "flos": 17601687509760.0, + "grad_norm": 2.063032941343346, + "language_loss": 0.70465046, + "learning_rate": 7.551095916917371e-07, + "loss": 0.72633725, + "num_input_tokens_seen": 129179990, + "step": 6008, + "time_per_iteration": 2.4540679454803467 + }, + { + "auxiliary_loss_clip": 0.01137213, + "auxiliary_loss_mlp": 0.01027759, + "balance_loss_clip": 1.04519868, + "balance_loss_mlp": 1.01980138, + "epoch": 0.7225395298502976, + "flos": 12932331016320.0, + "grad_norm": 3.9488347740190664, + "language_loss": 0.66441917, + "learning_rate": 7.545000135484758e-07, + "loss": 0.68606889, + "num_input_tokens_seen": 129197425, + "step": 6009, + "time_per_iteration": 2.4877817630767822 + }, + { + "auxiliary_loss_clip": 0.01169243, + "auxiliary_loss_mlp": 0.00761983, + "balance_loss_clip": 1.05022001, + "balance_loss_mlp": 1.0003767, + "epoch": 0.7226597727409367, + "flos": 29643899592960.0, + "grad_norm": 4.110110059471532, + "language_loss": 0.62876147, + "learning_rate": 7.538906243465714e-07, + "loss": 0.64807373, + "num_input_tokens_seen": 129217560, + "step": 6010, + "time_per_iteration": 2.475687265396118 + }, + { + "auxiliary_loss_clip": 0.01170204, + "auxiliary_loss_mlp": 0.01024402, + "balance_loss_clip": 1.05039358, + "balance_loss_mlp": 1.01715159, + "epoch": 0.7227800156315758, + "flos": 13771635183360.0, + "grad_norm": 2.1839692433528075, + "language_loss": 0.78865159, + "learning_rate": 7.5328142417847e-07, + "loss": 0.81059766, + "num_input_tokens_seen": 129234325, + "step": 6011, + "time_per_iteration": 2.3999295234680176 + }, + { + "auxiliary_loss_clip": 0.0114998, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.04466867, + "balance_loss_mlp": 1.02412021, + "epoch": 0.7229002585222148, + "flos": 20301882554880.0, + "grad_norm": 1.7848167117486438, + "language_loss": 0.69578463, + "learning_rate": 7.526724131365838e-07, + "loss": 0.71759224, + "num_input_tokens_seen": 129255280, + "step": 6012, + "time_per_iteration": 2.4556541442871094 + }, + { + "auxiliary_loss_clip": 0.01138734, + "auxiliary_loss_mlp": 0.01028338, + "balance_loss_clip": 1.04858291, + "balance_loss_mlp": 1.02079248, + "epoch": 0.723020501412854, + "flos": 16581250033920.0, + "grad_norm": 1.9976839115489198, + "language_loss": 0.70107841, + "learning_rate": 7.520635913133017e-07, + "loss": 0.72274917, + "num_input_tokens_seen": 129273910, + "step": 6013, + "time_per_iteration": 2.4425346851348877 + }, + { + "auxiliary_loss_clip": 0.01159931, + "auxiliary_loss_mlp": 0.01028621, + "balance_loss_clip": 1.04900575, + "balance_loss_mlp": 1.0204854, + "epoch": 0.7231407443034931, + "flos": 28548300908160.0, + "grad_norm": 1.8870891315437903, + "language_loss": 0.82634008, + "learning_rate": 7.514549588009798e-07, + "loss": 0.84822559, + "num_input_tokens_seen": 129294785, + "step": 6014, + "time_per_iteration": 2.4922797679901123 + }, + { + "auxiliary_loss_clip": 0.01141797, + "auxiliary_loss_mlp": 0.01025867, + "balance_loss_clip": 1.04628873, + "balance_loss_mlp": 1.01902401, + "epoch": 0.7232609871941321, + "flos": 30008536508160.0, + "grad_norm": 3.5782944014282987, + "language_loss": 0.70600319, + "learning_rate": 7.508465156919492e-07, + "loss": 0.72767979, + "num_input_tokens_seen": 129318295, + "step": 6015, + "time_per_iteration": 2.5513219833374023 + }, + { + "auxiliary_loss_clip": 0.01141146, + "auxiliary_loss_mlp": 0.01027078, + "balance_loss_clip": 1.04583597, + "balance_loss_mlp": 1.01952028, + "epoch": 0.7233812300847713, + "flos": 16654005031680.0, + "grad_norm": 2.3502057885378798, + "language_loss": 0.61568433, + "learning_rate": 7.502382620785083e-07, + "loss": 0.63736653, + "num_input_tokens_seen": 129334845, + "step": 6016, + "time_per_iteration": 3.2132537364959717 + }, + { + "auxiliary_loss_clip": 0.01028106, + "auxiliary_loss_mlp": 0.01004215, + "balance_loss_clip": 1.01288819, + "balance_loss_mlp": 1.00329721, + "epoch": 0.7235014729754103, + "flos": 67258784050560.0, + "grad_norm": 0.8044529059662425, + "language_loss": 0.62498724, + "learning_rate": 7.496301980529289e-07, + "loss": 0.64531052, + "num_input_tokens_seen": 129398055, + "step": 6017, + "time_per_iteration": 3.9700419902801514 + }, + { + "auxiliary_loss_clip": 0.01170349, + "auxiliary_loss_mlp": 0.01025866, + "balance_loss_clip": 1.0501442, + "balance_loss_mlp": 1.01863289, + "epoch": 0.7236217158660494, + "flos": 26943237671040.0, + "grad_norm": 5.784379325592039, + "language_loss": 0.74407017, + "learning_rate": 7.490223237074547e-07, + "loss": 0.76603234, + "num_input_tokens_seen": 129417765, + "step": 6018, + "time_per_iteration": 2.4488601684570312 + }, + { + "auxiliary_loss_clip": 0.01124949, + "auxiliary_loss_mlp": 0.01026676, + "balance_loss_clip": 1.04261518, + "balance_loss_mlp": 1.01899862, + "epoch": 0.7237419587566886, + "flos": 29423372042880.0, + "grad_norm": 2.0386632418468675, + "language_loss": 0.65892655, + "learning_rate": 7.484146391342989e-07, + "loss": 0.68044281, + "num_input_tokens_seen": 129437560, + "step": 6019, + "time_per_iteration": 3.402531862258911 + }, + { + "auxiliary_loss_clip": 0.01133786, + "auxiliary_loss_mlp": 0.01026306, + "balance_loss_clip": 1.04389572, + "balance_loss_mlp": 1.01911151, + "epoch": 0.7238622016473276, + "flos": 17821496787840.0, + "grad_norm": 3.1553578170541843, + "language_loss": 0.56824768, + "learning_rate": 7.478071444256484e-07, + "loss": 0.58984864, + "num_input_tokens_seen": 129455320, + "step": 6020, + "time_per_iteration": 2.4340176582336426 + }, + { + "auxiliary_loss_clip": 0.01136763, + "auxiliary_loss_mlp": 0.01025542, + "balance_loss_clip": 1.04581952, + "balance_loss_mlp": 1.01824927, + "epoch": 0.7239824445379667, + "flos": 25739117020800.0, + "grad_norm": 1.669516718253167, + "language_loss": 0.79221809, + "learning_rate": 7.471998396736579e-07, + "loss": 0.8138411, + "num_input_tokens_seen": 129475700, + "step": 6021, + "time_per_iteration": 3.297128677368164 + }, + { + "auxiliary_loss_clip": 0.01130012, + "auxiliary_loss_mlp": 0.01022486, + "balance_loss_clip": 1.04740191, + "balance_loss_mlp": 1.01543784, + "epoch": 0.7241026874286057, + "flos": 23148916398720.0, + "grad_norm": 1.6938870151479555, + "language_loss": 0.75689989, + "learning_rate": 7.465927249704549e-07, + "loss": 0.77842486, + "num_input_tokens_seen": 129493585, + "step": 6022, + "time_per_iteration": 2.5260355472564697 + }, + { + "auxiliary_loss_clip": 0.01153484, + "auxiliary_loss_mlp": 0.01024684, + "balance_loss_clip": 1.04785001, + "balance_loss_mlp": 1.01752806, + "epoch": 0.7242229303192449, + "flos": 20266905686400.0, + "grad_norm": 2.010529393297914, + "language_loss": 0.77115488, + "learning_rate": 7.459858004081398e-07, + "loss": 0.79293656, + "num_input_tokens_seen": 129511555, + "step": 6023, + "time_per_iteration": 2.4605953693389893 + }, + { + "auxiliary_loss_clip": 0.01029351, + "auxiliary_loss_mlp": 0.01003216, + "balance_loss_clip": 1.01182222, + "balance_loss_mlp": 1.00210714, + "epoch": 0.724343173209884, + "flos": 62311659684480.0, + "grad_norm": 0.6565992248594862, + "language_loss": 0.58060646, + "learning_rate": 7.453790660787815e-07, + "loss": 0.60093212, + "num_input_tokens_seen": 129579650, + "step": 6024, + "time_per_iteration": 3.1811561584472656 + }, + { + "auxiliary_loss_clip": 0.01142533, + "auxiliary_loss_mlp": 0.0102236, + "balance_loss_clip": 1.04668736, + "balance_loss_mlp": 1.01466513, + "epoch": 0.724463416100523, + "flos": 35006403813120.0, + "grad_norm": 2.487729595746856, + "language_loss": 0.63646984, + "learning_rate": 7.447725220744214e-07, + "loss": 0.65811872, + "num_input_tokens_seen": 129601895, + "step": 6025, + "time_per_iteration": 2.607048988342285 + }, + { + "auxiliary_loss_clip": 0.01168389, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.04786289, + "balance_loss_mlp": 1.02077329, + "epoch": 0.7245836589911622, + "flos": 21871968923520.0, + "grad_norm": 2.191457515657719, + "language_loss": 0.77638066, + "learning_rate": 7.441661684870717e-07, + "loss": 0.79834729, + "num_input_tokens_seen": 129622150, + "step": 6026, + "time_per_iteration": 2.4498631954193115 + }, + { + "auxiliary_loss_clip": 0.01169349, + "auxiliary_loss_mlp": 0.01020306, + "balance_loss_clip": 1.0504781, + "balance_loss_mlp": 1.01339173, + "epoch": 0.7247039018818012, + "flos": 23006494972800.0, + "grad_norm": 1.6908496084677738, + "language_loss": 0.81659722, + "learning_rate": 7.435600054087152e-07, + "loss": 0.8384937, + "num_input_tokens_seen": 129644315, + "step": 6027, + "time_per_iteration": 2.456315279006958 + }, + { + "auxiliary_loss_clip": 0.01172203, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.05223274, + "balance_loss_mlp": 1.02198195, + "epoch": 0.7248241447724403, + "flos": 31722588587520.0, + "grad_norm": 1.7903916572246275, + "language_loss": 0.74406439, + "learning_rate": 7.42954032931308e-07, + "loss": 0.76607919, + "num_input_tokens_seen": 129665355, + "step": 6028, + "time_per_iteration": 2.5084571838378906 + }, + { + "auxiliary_loss_clip": 0.01140779, + "auxiliary_loss_mlp": 0.0102533, + "balance_loss_clip": 1.04539967, + "balance_loss_mlp": 1.01845133, + "epoch": 0.7249443876630794, + "flos": 34896984007680.0, + "grad_norm": 1.8133674041723753, + "language_loss": 0.74583489, + "learning_rate": 7.423482511467733e-07, + "loss": 0.76749599, + "num_input_tokens_seen": 129686125, + "step": 6029, + "time_per_iteration": 2.6122748851776123 + }, + { + "auxiliary_loss_clip": 0.01087051, + "auxiliary_loss_mlp": 0.0102633, + "balance_loss_clip": 1.04073763, + "balance_loss_mlp": 1.01939464, + "epoch": 0.7250646305537185, + "flos": 26359294268160.0, + "grad_norm": 2.028596163551907, + "language_loss": 0.64529043, + "learning_rate": 7.417426601470099e-07, + "loss": 0.66642416, + "num_input_tokens_seen": 129706485, + "step": 6030, + "time_per_iteration": 2.619476795196533 + }, + { + "auxiliary_loss_clip": 0.01156154, + "auxiliary_loss_mlp": 0.01025234, + "balance_loss_clip": 1.04832625, + "balance_loss_mlp": 1.01741385, + "epoch": 0.7251848734443576, + "flos": 30081614728320.0, + "grad_norm": 2.068492348662004, + "language_loss": 0.78497219, + "learning_rate": 7.411372600238841e-07, + "loss": 0.80678606, + "num_input_tokens_seen": 129727100, + "step": 6031, + "time_per_iteration": 2.520246982574463 + }, + { + "auxiliary_loss_clip": 0.01168934, + "auxiliary_loss_mlp": 0.01027507, + "balance_loss_clip": 1.04935288, + "balance_loss_mlp": 1.02019346, + "epoch": 0.7253051163349967, + "flos": 17785262943360.0, + "grad_norm": 1.9766609161773634, + "language_loss": 0.73686266, + "learning_rate": 7.405320508692346e-07, + "loss": 0.75882709, + "num_input_tokens_seen": 129745840, + "step": 6032, + "time_per_iteration": 2.3928418159484863 + }, + { + "auxiliary_loss_clip": 0.01165948, + "auxiliary_loss_mlp": 0.01021786, + "balance_loss_clip": 1.04986668, + "balance_loss_mlp": 1.01487768, + "epoch": 0.7254253592256358, + "flos": 12641346938880.0, + "grad_norm": 1.7691461649455498, + "language_loss": 0.75384581, + "learning_rate": 7.399270327748727e-07, + "loss": 0.77572316, + "num_input_tokens_seen": 129763500, + "step": 6033, + "time_per_iteration": 2.407437801361084 + }, + { + "auxiliary_loss_clip": 0.01126669, + "auxiliary_loss_mlp": 0.00760791, + "balance_loss_clip": 1.04334259, + "balance_loss_mlp": 1.00032187, + "epoch": 0.7255456021162748, + "flos": 27199208966400.0, + "grad_norm": 1.8752201488629237, + "language_loss": 0.74161017, + "learning_rate": 7.39322205832577e-07, + "loss": 0.76048476, + "num_input_tokens_seen": 129784390, + "step": 6034, + "time_per_iteration": 2.5662126541137695 + }, + { + "auxiliary_loss_clip": 0.01136063, + "auxiliary_loss_mlp": 0.01021565, + "balance_loss_clip": 1.04540586, + "balance_loss_mlp": 1.01435924, + "epoch": 0.725665845006914, + "flos": 21288205088640.0, + "grad_norm": 1.9693143105615372, + "language_loss": 0.80809402, + "learning_rate": 7.387175701341009e-07, + "loss": 0.82967037, + "num_input_tokens_seen": 129803060, + "step": 6035, + "time_per_iteration": 2.4947454929351807 + }, + { + "auxiliary_loss_clip": 0.01153514, + "auxiliary_loss_mlp": 0.01022176, + "balance_loss_clip": 1.04659915, + "balance_loss_mlp": 1.01502991, + "epoch": 0.7257860878975531, + "flos": 16033684129920.0, + "grad_norm": 10.596058797726988, + "language_loss": 0.72169155, + "learning_rate": 7.381131257711659e-07, + "loss": 0.7434485, + "num_input_tokens_seen": 129820165, + "step": 6036, + "time_per_iteration": 2.428905725479126 + }, + { + "auxiliary_loss_clip": 0.01140422, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.05238068, + "balance_loss_mlp": 1.0189997, + "epoch": 0.7259063307881921, + "flos": 12129943052160.0, + "grad_norm": 1.7709084545011553, + "language_loss": 0.83624882, + "learning_rate": 7.375088728354677e-07, + "loss": 0.85791272, + "num_input_tokens_seen": 129835195, + "step": 6037, + "time_per_iteration": 2.4450795650482178 + }, + { + "auxiliary_loss_clip": 0.01129009, + "auxiliary_loss_mlp": 0.01022151, + "balance_loss_clip": 1.04504681, + "balance_loss_mlp": 1.01491451, + "epoch": 0.7260265736788313, + "flos": 30443845432320.0, + "grad_norm": 1.6581015970387019, + "language_loss": 0.67050523, + "learning_rate": 7.369048114186691e-07, + "loss": 0.69201684, + "num_input_tokens_seen": 129856240, + "step": 6038, + "time_per_iteration": 2.5816047191619873 + }, + { + "auxiliary_loss_clip": 0.01135205, + "auxiliary_loss_mlp": 0.00761333, + "balance_loss_clip": 1.04711318, + "balance_loss_mlp": 1.00029314, + "epoch": 0.7261468165694703, + "flos": 21142264129920.0, + "grad_norm": 1.7516089499832659, + "language_loss": 0.83453536, + "learning_rate": 7.363009416124055e-07, + "loss": 0.85350072, + "num_input_tokens_seen": 129875565, + "step": 6039, + "time_per_iteration": 2.508725166320801 + }, + { + "auxiliary_loss_clip": 0.01130544, + "auxiliary_loss_mlp": 0.01023985, + "balance_loss_clip": 1.04731607, + "balance_loss_mlp": 1.01664138, + "epoch": 0.7262670594601094, + "flos": 22306308180480.0, + "grad_norm": 2.158664072081652, + "language_loss": 0.62693131, + "learning_rate": 7.356972635082852e-07, + "loss": 0.6484766, + "num_input_tokens_seen": 129894420, + "step": 6040, + "time_per_iteration": 2.508397340774536 + }, + { + "auxiliary_loss_clip": 0.01112205, + "auxiliary_loss_mlp": 0.01026033, + "balance_loss_clip": 1.04809213, + "balance_loss_mlp": 1.01895809, + "epoch": 0.7263873023507486, + "flos": 25335049950720.0, + "grad_norm": 1.8823020402572586, + "language_loss": 0.75217694, + "learning_rate": 7.35093777197884e-07, + "loss": 0.77355933, + "num_input_tokens_seen": 129914490, + "step": 6041, + "time_per_iteration": 2.575162887573242 + }, + { + "auxiliary_loss_clip": 0.01139885, + "auxiliary_loss_mlp": 0.01020626, + "balance_loss_clip": 1.04828906, + "balance_loss_mlp": 1.01376522, + "epoch": 0.7265075452413876, + "flos": 23878621192320.0, + "grad_norm": 2.175094438950915, + "language_loss": 0.86124527, + "learning_rate": 7.344904827727525e-07, + "loss": 0.88285047, + "num_input_tokens_seen": 129931670, + "step": 6042, + "time_per_iteration": 2.4986205101013184 + }, + { + "auxiliary_loss_clip": 0.0112713, + "auxiliary_loss_mlp": 0.01023941, + "balance_loss_clip": 1.04285884, + "balance_loss_mlp": 1.01655054, + "epoch": 0.7266277881320267, + "flos": 28724549967360.0, + "grad_norm": 2.993181674063456, + "language_loss": 0.73743117, + "learning_rate": 7.338873803244076e-07, + "loss": 0.75894189, + "num_input_tokens_seen": 129946905, + "step": 6043, + "time_per_iteration": 3.282459020614624 + }, + { + "auxiliary_loss_clip": 0.01135136, + "auxiliary_loss_mlp": 0.0102525, + "balance_loss_clip": 1.04605055, + "balance_loss_mlp": 1.01845217, + "epoch": 0.7267480310226658, + "flos": 24863507182080.0, + "grad_norm": 1.7229417007540275, + "language_loss": 0.80872029, + "learning_rate": 7.332844699443401e-07, + "loss": 0.83032417, + "num_input_tokens_seen": 129965505, + "step": 6044, + "time_per_iteration": 3.36826491355896 + }, + { + "auxiliary_loss_clip": 0.01102204, + "auxiliary_loss_mlp": 0.01024867, + "balance_loss_clip": 1.04048014, + "balance_loss_mlp": 1.01815832, + "epoch": 0.7268682739133049, + "flos": 27198490694400.0, + "grad_norm": 1.9023147716971842, + "language_loss": 0.75172096, + "learning_rate": 7.326817517240121e-07, + "loss": 0.77299166, + "num_input_tokens_seen": 129987210, + "step": 6045, + "time_per_iteration": 2.589280366897583 + }, + { + "auxiliary_loss_clip": 0.01154823, + "auxiliary_loss_mlp": 0.00760961, + "balance_loss_clip": 1.0481807, + "balance_loss_mlp": 1.00028038, + "epoch": 0.7269885168039439, + "flos": 33508138688640.0, + "grad_norm": 2.189282586823286, + "language_loss": 0.83345866, + "learning_rate": 7.320792257548545e-07, + "loss": 0.85261655, + "num_input_tokens_seen": 130008385, + "step": 6046, + "time_per_iteration": 3.3918864727020264 + }, + { + "auxiliary_loss_clip": 0.01144733, + "auxiliary_loss_mlp": 0.01022623, + "balance_loss_clip": 1.04714429, + "balance_loss_mlp": 1.0151602, + "epoch": 0.7271087596945831, + "flos": 24313750548480.0, + "grad_norm": 1.8360867878056761, + "language_loss": 0.76349485, + "learning_rate": 7.314768921282704e-07, + "loss": 0.78516841, + "num_input_tokens_seen": 130029040, + "step": 6047, + "time_per_iteration": 2.504153251647949 + }, + { + "auxiliary_loss_clip": 0.01156181, + "auxiliary_loss_mlp": 0.01023949, + "balance_loss_clip": 1.04748607, + "balance_loss_mlp": 1.01679075, + "epoch": 0.7272290025852222, + "flos": 23805147922560.0, + "grad_norm": 2.825462046756991, + "language_loss": 0.72337544, + "learning_rate": 7.30874750935633e-07, + "loss": 0.74517673, + "num_input_tokens_seen": 130048725, + "step": 6048, + "time_per_iteration": 3.2359619140625 + }, + { + "auxiliary_loss_clip": 0.01126845, + "auxiliary_loss_mlp": 0.01023657, + "balance_loss_clip": 1.04684186, + "balance_loss_mlp": 1.01652217, + "epoch": 0.7273492454758612, + "flos": 16720367408640.0, + "grad_norm": 4.23380308243205, + "language_loss": 0.7874791, + "learning_rate": 7.30272802268286e-07, + "loss": 0.8089841, + "num_input_tokens_seen": 130065720, + "step": 6049, + "time_per_iteration": 2.487602949142456 + }, + { + "auxiliary_loss_clip": 0.01071103, + "auxiliary_loss_mlp": 0.01021433, + "balance_loss_clip": 1.03615773, + "balance_loss_mlp": 1.01452446, + "epoch": 0.7274694883665004, + "flos": 28031330413440.0, + "grad_norm": 2.808644567431553, + "language_loss": 0.7596457, + "learning_rate": 7.29671046217547e-07, + "loss": 0.78057104, + "num_input_tokens_seen": 130084830, + "step": 6050, + "time_per_iteration": 2.5993053913116455 + }, + { + "auxiliary_loss_clip": 0.01128471, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.0463599, + "balance_loss_mlp": 1.02125168, + "epoch": 0.7275897312571394, + "flos": 30372706546560.0, + "grad_norm": 1.825345955677869, + "language_loss": 0.81345701, + "learning_rate": 7.290694828746988e-07, + "loss": 0.83502495, + "num_input_tokens_seen": 130104495, + "step": 6051, + "time_per_iteration": 2.577407121658325 + }, + { + "auxiliary_loss_clip": 0.01129597, + "auxiliary_loss_mlp": 0.01024737, + "balance_loss_clip": 1.04344559, + "balance_loss_mlp": 1.01746202, + "epoch": 0.7277099741477785, + "flos": 19204775498880.0, + "grad_norm": 1.964649200838904, + "language_loss": 0.85851324, + "learning_rate": 7.284681123310004e-07, + "loss": 0.88005662, + "num_input_tokens_seen": 130123210, + "step": 6052, + "time_per_iteration": 2.51304030418396 + }, + { + "auxiliary_loss_clip": 0.01152796, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.04791343, + "balance_loss_mlp": 1.02045512, + "epoch": 0.7278302170384175, + "flos": 20667884186880.0, + "grad_norm": 2.98817717559006, + "language_loss": 0.79554474, + "learning_rate": 7.27866934677678e-07, + "loss": 0.81735516, + "num_input_tokens_seen": 130142880, + "step": 6053, + "time_per_iteration": 2.4361069202423096 + }, + { + "auxiliary_loss_clip": 0.01108135, + "auxiliary_loss_mlp": 0.01022581, + "balance_loss_clip": 1.04340911, + "balance_loss_mlp": 1.01555705, + "epoch": 0.7279504599290567, + "flos": 19093200877440.0, + "grad_norm": 1.6930859080041893, + "language_loss": 0.78456926, + "learning_rate": 7.272659500059297e-07, + "loss": 0.80587649, + "num_input_tokens_seen": 130160220, + "step": 6054, + "time_per_iteration": 2.5480964183807373 + }, + { + "auxiliary_loss_clip": 0.0114776, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.04690599, + "balance_loss_mlp": 1.02736211, + "epoch": 0.7280707028196958, + "flos": 19062174504960.0, + "grad_norm": 2.661770738953473, + "language_loss": 0.80178165, + "learning_rate": 7.266651584069264e-07, + "loss": 0.82360345, + "num_input_tokens_seen": 130177885, + "step": 6055, + "time_per_iteration": 2.451216459274292 + }, + { + "auxiliary_loss_clip": 0.01157912, + "auxiliary_loss_mlp": 0.01022527, + "balance_loss_clip": 1.0509063, + "balance_loss_mlp": 1.01553559, + "epoch": 0.7281909457103348, + "flos": 37196308293120.0, + "grad_norm": 1.636765370791178, + "language_loss": 0.57367694, + "learning_rate": 7.260645599718045e-07, + "loss": 0.59548134, + "num_input_tokens_seen": 130204240, + "step": 6056, + "time_per_iteration": 2.5949149131774902 + }, + { + "auxiliary_loss_clip": 0.01139184, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.04522789, + "balance_loss_mlp": 1.02120817, + "epoch": 0.728311188600974, + "flos": 20667094087680.0, + "grad_norm": 7.417084390698381, + "language_loss": 0.674986, + "learning_rate": 7.254641547916767e-07, + "loss": 0.69666833, + "num_input_tokens_seen": 130221735, + "step": 6057, + "time_per_iteration": 2.465744733810425 + }, + { + "auxiliary_loss_clip": 0.01168454, + "auxiliary_loss_mlp": 0.01030964, + "balance_loss_clip": 1.05055022, + "balance_loss_mlp": 1.02382374, + "epoch": 0.728431431491613, + "flos": 28840685616000.0, + "grad_norm": 2.034867113792075, + "language_loss": 0.69522333, + "learning_rate": 7.248639429576226e-07, + "loss": 0.7172175, + "num_input_tokens_seen": 130241190, + "step": 6058, + "time_per_iteration": 2.4541234970092773 + }, + { + "auxiliary_loss_clip": 0.01153669, + "auxiliary_loss_mlp": 0.0102727, + "balance_loss_clip": 1.04661965, + "balance_loss_mlp": 1.02001333, + "epoch": 0.7285516743822521, + "flos": 25991856092160.0, + "grad_norm": 1.601315687084765, + "language_loss": 0.71790141, + "learning_rate": 7.242639245606959e-07, + "loss": 0.73971081, + "num_input_tokens_seen": 130260980, + "step": 6059, + "time_per_iteration": 2.473053455352783 + }, + { + "auxiliary_loss_clip": 0.01145635, + "auxiliary_loss_mlp": 0.01026477, + "balance_loss_clip": 1.04517055, + "balance_loss_mlp": 1.01922345, + "epoch": 0.7286719172728913, + "flos": 16399721675520.0, + "grad_norm": 1.8144714658929, + "language_loss": 0.82548076, + "learning_rate": 7.236640996919168e-07, + "loss": 0.84720182, + "num_input_tokens_seen": 130280025, + "step": 6060, + "time_per_iteration": 2.4468722343444824 + }, + { + "auxiliary_loss_clip": 0.01155341, + "auxiliary_loss_mlp": 0.01025189, + "balance_loss_clip": 1.04712296, + "balance_loss_mlp": 1.01819754, + "epoch": 0.7287921601635303, + "flos": 22018161277440.0, + "grad_norm": 1.5124226124954496, + "language_loss": 0.7071898, + "learning_rate": 7.230644684422782e-07, + "loss": 0.72899508, + "num_input_tokens_seen": 130300255, + "step": 6061, + "time_per_iteration": 2.454136610031128 + }, + { + "auxiliary_loss_clip": 0.0112672, + "auxiliary_loss_mlp": 0.01027259, + "balance_loss_clip": 1.04550767, + "balance_loss_mlp": 1.02050257, + "epoch": 0.7289124030541694, + "flos": 24600927784320.0, + "grad_norm": 1.712803627920868, + "language_loss": 0.8133564, + "learning_rate": 7.224650309027451e-07, + "loss": 0.83489615, + "num_input_tokens_seen": 130320005, + "step": 6062, + "time_per_iteration": 2.5313501358032227 + }, + { + "auxiliary_loss_clip": 0.01158991, + "auxiliary_loss_mlp": 0.01022877, + "balance_loss_clip": 1.05110061, + "balance_loss_mlp": 1.01581049, + "epoch": 0.7290326459448085, + "flos": 21393638484480.0, + "grad_norm": 2.899120479963915, + "language_loss": 0.68734419, + "learning_rate": 7.218657871642506e-07, + "loss": 0.70916289, + "num_input_tokens_seen": 130338810, + "step": 6063, + "time_per_iteration": 2.4475221633911133 + }, + { + "auxiliary_loss_clip": 0.01170728, + "auxiliary_loss_mlp": 0.01026677, + "balance_loss_clip": 1.04975176, + "balance_loss_mlp": 1.01953936, + "epoch": 0.7291528888354476, + "flos": 18587686821120.0, + "grad_norm": 2.0182695687246297, + "language_loss": 0.62324286, + "learning_rate": 7.212667373177012e-07, + "loss": 0.64521694, + "num_input_tokens_seen": 130353805, + "step": 6064, + "time_per_iteration": 2.3907220363616943 + }, + { + "auxiliary_loss_clip": 0.01125621, + "auxiliary_loss_mlp": 0.01023874, + "balance_loss_clip": 1.04461861, + "balance_loss_mlp": 1.01669705, + "epoch": 0.7292731317260867, + "flos": 18951066760320.0, + "grad_norm": 5.4491074619613, + "language_loss": 0.75430483, + "learning_rate": 7.206678814539704e-07, + "loss": 0.77579975, + "num_input_tokens_seen": 130372105, + "step": 6065, + "time_per_iteration": 2.501145124435425 + }, + { + "auxiliary_loss_clip": 0.01120068, + "auxiliary_loss_mlp": 0.0102462, + "balance_loss_clip": 1.04428017, + "balance_loss_mlp": 1.01791716, + "epoch": 0.7293933746167258, + "flos": 21067569797760.0, + "grad_norm": 2.071844414826551, + "language_loss": 0.72687268, + "learning_rate": 7.20069219663904e-07, + "loss": 0.74831951, + "num_input_tokens_seen": 130391990, + "step": 6066, + "time_per_iteration": 2.5456364154815674 + }, + { + "auxiliary_loss_clip": 0.01155773, + "auxiliary_loss_mlp": 0.01023117, + "balance_loss_clip": 1.04570138, + "balance_loss_mlp": 1.01618218, + "epoch": 0.7295136175073649, + "flos": 22453326547200.0, + "grad_norm": 1.775308638986487, + "language_loss": 0.79362333, + "learning_rate": 7.1947075203832e-07, + "loss": 0.81541228, + "num_input_tokens_seen": 130411970, + "step": 6067, + "time_per_iteration": 2.4571807384490967 + }, + { + "auxiliary_loss_clip": 0.01069948, + "auxiliary_loss_mlp": 0.01002267, + "balance_loss_clip": 1.0148766, + "balance_loss_mlp": 1.00118196, + "epoch": 0.7296338603980039, + "flos": 56125506648960.0, + "grad_norm": 0.8625292430452522, + "language_loss": 0.60186422, + "learning_rate": 7.188724786680049e-07, + "loss": 0.62258625, + "num_input_tokens_seen": 130472440, + "step": 6068, + "time_per_iteration": 3.006516695022583 + }, + { + "auxiliary_loss_clip": 0.0113927, + "auxiliary_loss_mlp": 0.01024096, + "balance_loss_clip": 1.04480898, + "balance_loss_mlp": 1.0167284, + "epoch": 0.7297541032886431, + "flos": 25228287751680.0, + "grad_norm": 1.7191336596005689, + "language_loss": 0.75628817, + "learning_rate": 7.182743996437162e-07, + "loss": 0.7779218, + "num_input_tokens_seen": 130491975, + "step": 6069, + "time_per_iteration": 2.518704652786255 + }, + { + "auxiliary_loss_clip": 0.01132922, + "auxiliary_loss_mlp": 0.01022314, + "balance_loss_clip": 1.04339182, + "balance_loss_mlp": 1.01458955, + "epoch": 0.7298743461792822, + "flos": 26467600752000.0, + "grad_norm": 1.8681838726139628, + "language_loss": 0.6862877, + "learning_rate": 7.176765150561819e-07, + "loss": 0.70784003, + "num_input_tokens_seen": 130510580, + "step": 6070, + "time_per_iteration": 4.110658884048462 + }, + { + "auxiliary_loss_clip": 0.0116921, + "auxiliary_loss_mlp": 0.01023712, + "balance_loss_clip": 1.0474968, + "balance_loss_mlp": 1.0165863, + "epoch": 0.7299945890699212, + "flos": 19569053278080.0, + "grad_norm": 2.196190424696349, + "language_loss": 0.79589492, + "learning_rate": 7.170788249961002e-07, + "loss": 0.81782413, + "num_input_tokens_seen": 130529090, + "step": 6071, + "time_per_iteration": 2.4754576683044434 + }, + { + "auxiliary_loss_clip": 0.01164138, + "auxiliary_loss_mlp": 0.01023538, + "balance_loss_clip": 1.04709721, + "balance_loss_mlp": 1.01658547, + "epoch": 0.7301148319605604, + "flos": 22928963466240.0, + "grad_norm": 1.8685895341075822, + "language_loss": 0.88565135, + "learning_rate": 7.164813295541418e-07, + "loss": 0.90752816, + "num_input_tokens_seen": 130548655, + "step": 6072, + "time_per_iteration": 3.26410174369812 + }, + { + "auxiliary_loss_clip": 0.0114143, + "auxiliary_loss_mlp": 0.01025777, + "balance_loss_clip": 1.04490077, + "balance_loss_mlp": 1.01874959, + "epoch": 0.7302350748511994, + "flos": 25369703596800.0, + "grad_norm": 1.607299385512349, + "language_loss": 0.70380247, + "learning_rate": 7.15884028820944e-07, + "loss": 0.72547454, + "num_input_tokens_seen": 130567710, + "step": 6073, + "time_per_iteration": 2.502211570739746 + }, + { + "auxiliary_loss_clip": 0.01118953, + "auxiliary_loss_mlp": 0.01021036, + "balance_loss_clip": 1.0405674, + "balance_loss_mlp": 1.01415753, + "epoch": 0.7303553177418385, + "flos": 27819170732160.0, + "grad_norm": 2.0068940394597554, + "language_loss": 0.60835981, + "learning_rate": 7.152869228871185e-07, + "loss": 0.62975967, + "num_input_tokens_seen": 130590195, + "step": 6074, + "time_per_iteration": 2.5531303882598877 + }, + { + "auxiliary_loss_clip": 0.01135023, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.04469085, + "balance_loss_mlp": 1.02057123, + "epoch": 0.7304755606324776, + "flos": 24426510318720.0, + "grad_norm": 2.0650139150635085, + "language_loss": 0.72399163, + "learning_rate": 7.146900118432457e-07, + "loss": 0.74562347, + "num_input_tokens_seen": 130609940, + "step": 6075, + "time_per_iteration": 3.325765371322632 + }, + { + "auxiliary_loss_clip": 0.01081949, + "auxiliary_loss_mlp": 0.01029659, + "balance_loss_clip": 1.03608203, + "balance_loss_mlp": 1.02258372, + "epoch": 0.7305958035231167, + "flos": 23840483927040.0, + "grad_norm": 1.6288849391698481, + "language_loss": 0.85704744, + "learning_rate": 7.140932957798753e-07, + "loss": 0.87816358, + "num_input_tokens_seen": 130628380, + "step": 6076, + "time_per_iteration": 2.7142691612243652 + }, + { + "auxiliary_loss_clip": 0.01142766, + "auxiliary_loss_mlp": 0.01022122, + "balance_loss_clip": 1.04408932, + "balance_loss_mlp": 1.01482594, + "epoch": 0.7307160464137558, + "flos": 16726939597440.0, + "grad_norm": 2.9542674738387116, + "language_loss": 0.71352798, + "learning_rate": 7.134967747875309e-07, + "loss": 0.73517686, + "num_input_tokens_seen": 130646590, + "step": 6077, + "time_per_iteration": 2.49676775932312 + }, + { + "auxiliary_loss_clip": 0.01149278, + "auxiliary_loss_mlp": 0.01025124, + "balance_loss_clip": 1.04496181, + "balance_loss_mlp": 1.01800728, + "epoch": 0.7308362893043949, + "flos": 21798280172160.0, + "grad_norm": 2.110313043272194, + "language_loss": 0.8193149, + "learning_rate": 7.129004489567014e-07, + "loss": 0.84105891, + "num_input_tokens_seen": 130664070, + "step": 6078, + "time_per_iteration": 2.446838617324829 + }, + { + "auxiliary_loss_clip": 0.01130748, + "auxiliary_loss_mlp": 0.01022242, + "balance_loss_clip": 1.04479504, + "balance_loss_mlp": 1.01500595, + "epoch": 0.730956532195034, + "flos": 10707377840640.0, + "grad_norm": 2.174305448171294, + "language_loss": 0.78347278, + "learning_rate": 7.123043183778512e-07, + "loss": 0.80500269, + "num_input_tokens_seen": 130681400, + "step": 6079, + "time_per_iteration": 2.5183959007263184 + }, + { + "auxiliary_loss_clip": 0.01133809, + "auxiliary_loss_mlp": 0.0103189, + "balance_loss_clip": 1.04717827, + "balance_loss_mlp": 1.02471328, + "epoch": 0.731076775085673, + "flos": 19791987039360.0, + "grad_norm": 2.3414133347998134, + "language_loss": 0.65093482, + "learning_rate": 7.117083831414114e-07, + "loss": 0.67259181, + "num_input_tokens_seen": 130700675, + "step": 6080, + "time_per_iteration": 2.5625460147857666 + }, + { + "auxiliary_loss_clip": 0.0116518, + "auxiliary_loss_mlp": 0.01022285, + "balance_loss_clip": 1.04853845, + "balance_loss_mlp": 1.01529944, + "epoch": 0.7311970179763122, + "flos": 20447033414400.0, + "grad_norm": 1.9020191974456258, + "language_loss": 0.69872558, + "learning_rate": 7.11112643337787e-07, + "loss": 0.72060025, + "num_input_tokens_seen": 130719720, + "step": 6081, + "time_per_iteration": 2.460615873336792 + }, + { + "auxiliary_loss_clip": 0.01143326, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.0495975, + "balance_loss_mlp": 1.02253628, + "epoch": 0.7313172608669513, + "flos": 18513818501760.0, + "grad_norm": 2.0916858946149977, + "language_loss": 0.76452732, + "learning_rate": 7.10517099057349e-07, + "loss": 0.78626251, + "num_input_tokens_seen": 130736670, + "step": 6082, + "time_per_iteration": 2.538703203201294 + }, + { + "auxiliary_loss_clip": 0.01138001, + "auxiliary_loss_mlp": 0.01021984, + "balance_loss_clip": 1.04422379, + "balance_loss_mlp": 1.01442039, + "epoch": 0.7314375037575903, + "flos": 16180738410240.0, + "grad_norm": 3.406099754138424, + "language_loss": 0.61160922, + "learning_rate": 7.099217503904411e-07, + "loss": 0.63320905, + "num_input_tokens_seen": 130754525, + "step": 6083, + "time_per_iteration": 2.470655918121338 + }, + { + "auxiliary_loss_clip": 0.01142672, + "auxiliary_loss_mlp": 0.01023461, + "balance_loss_clip": 1.04683614, + "balance_loss_mlp": 1.01662695, + "epoch": 0.7315577466482295, + "flos": 17967940536960.0, + "grad_norm": 2.168818214662408, + "language_loss": 0.90080929, + "learning_rate": 7.093265974273788e-07, + "loss": 0.92247063, + "num_input_tokens_seen": 130772420, + "step": 6084, + "time_per_iteration": 2.451982259750366 + }, + { + "auxiliary_loss_clip": 0.0115271, + "auxiliary_loss_mlp": 0.01021605, + "balance_loss_clip": 1.04469919, + "balance_loss_mlp": 1.01465845, + "epoch": 0.7316779895388685, + "flos": 18405440190720.0, + "grad_norm": 1.8290021345354643, + "language_loss": 0.72191536, + "learning_rate": 7.087316402584447e-07, + "loss": 0.74365848, + "num_input_tokens_seen": 130791245, + "step": 6085, + "time_per_iteration": 2.4172520637512207 + }, + { + "auxiliary_loss_clip": 0.0116766, + "auxiliary_loss_mlp": 0.01020727, + "balance_loss_clip": 1.04932082, + "balance_loss_mlp": 1.01332378, + "epoch": 0.7317982324295076, + "flos": 17928294900480.0, + "grad_norm": 1.975844527751707, + "language_loss": 0.863065, + "learning_rate": 7.081368789738953e-07, + "loss": 0.88494885, + "num_input_tokens_seen": 130808445, + "step": 6086, + "time_per_iteration": 2.376862049102783 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.01023305, + "balance_loss_clip": 1.0412426, + "balance_loss_mlp": 1.01580071, + "epoch": 0.7319184753201466, + "flos": 27229840289280.0, + "grad_norm": 2.4064825638920206, + "language_loss": 0.77936935, + "learning_rate": 7.075423136639537e-07, + "loss": 0.80093479, + "num_input_tokens_seen": 130827700, + "step": 6087, + "time_per_iteration": 2.518655300140381 + }, + { + "auxiliary_loss_clip": 0.01117999, + "auxiliary_loss_mlp": 0.01023328, + "balance_loss_clip": 1.04141688, + "balance_loss_mlp": 1.0159725, + "epoch": 0.7320387182107858, + "flos": 37448544574080.0, + "grad_norm": 2.1011986921515855, + "language_loss": 0.74276489, + "learning_rate": 7.069479444188149e-07, + "loss": 0.7641781, + "num_input_tokens_seen": 130848290, + "step": 6088, + "time_per_iteration": 2.634871482849121 + }, + { + "auxiliary_loss_clip": 0.01132309, + "auxiliary_loss_mlp": 0.01023598, + "balance_loss_clip": 1.04561925, + "balance_loss_mlp": 1.01604056, + "epoch": 0.7321589611014249, + "flos": 17859023521920.0, + "grad_norm": 1.822385432380353, + "language_loss": 0.82249892, + "learning_rate": 7.063537713286453e-07, + "loss": 0.84405804, + "num_input_tokens_seen": 130865970, + "step": 6089, + "time_per_iteration": 2.4458703994750977 + }, + { + "auxiliary_loss_clip": 0.01145208, + "auxiliary_loss_mlp": 0.01022444, + "balance_loss_clip": 1.04608583, + "balance_loss_mlp": 1.01515746, + "epoch": 0.7322792039920639, + "flos": 26100593539200.0, + "grad_norm": 1.7472471266449063, + "language_loss": 0.80879009, + "learning_rate": 7.057597944835803e-07, + "loss": 0.83046657, + "num_input_tokens_seen": 130885245, + "step": 6090, + "time_per_iteration": 2.5119845867156982 + }, + { + "auxiliary_loss_clip": 0.01131904, + "auxiliary_loss_mlp": 0.01021081, + "balance_loss_clip": 1.04381573, + "balance_loss_mlp": 1.01420283, + "epoch": 0.7323994468827031, + "flos": 25369093065600.0, + "grad_norm": 1.550271164377914, + "language_loss": 0.745031, + "learning_rate": 7.051660139737253e-07, + "loss": 0.76656085, + "num_input_tokens_seen": 130903465, + "step": 6091, + "time_per_iteration": 2.5374433994293213 + }, + { + "auxiliary_loss_clip": 0.01151214, + "auxiliary_loss_mlp": 0.00761542, + "balance_loss_clip": 1.04873168, + "balance_loss_mlp": 1.00032687, + "epoch": 0.7325196897733421, + "flos": 26907075653760.0, + "grad_norm": 2.404974823497621, + "language_loss": 0.7659322, + "learning_rate": 7.045724298891565e-07, + "loss": 0.78505969, + "num_input_tokens_seen": 130922935, + "step": 6092, + "time_per_iteration": 2.483245849609375 + }, + { + "auxiliary_loss_clip": 0.01152253, + "auxiliary_loss_mlp": 0.01023174, + "balance_loss_clip": 1.04722667, + "balance_loss_mlp": 1.01623547, + "epoch": 0.7326399326639812, + "flos": 25775781828480.0, + "grad_norm": 1.9320790573908304, + "language_loss": 0.69105494, + "learning_rate": 7.039790423199192e-07, + "loss": 0.71280921, + "num_input_tokens_seen": 130942575, + "step": 6093, + "time_per_iteration": 2.464826822280884 + }, + { + "auxiliary_loss_clip": 0.01142848, + "auxiliary_loss_mlp": 0.01024626, + "balance_loss_clip": 1.04637527, + "balance_loss_mlp": 1.01764011, + "epoch": 0.7327601755546204, + "flos": 21032269706880.0, + "grad_norm": 1.9397062132468301, + "language_loss": 0.77659988, + "learning_rate": 7.033858513560322e-07, + "loss": 0.79827464, + "num_input_tokens_seen": 130958870, + "step": 6094, + "time_per_iteration": 2.446718454360962 + }, + { + "auxiliary_loss_clip": 0.01153437, + "auxiliary_loss_mlp": 0.01022726, + "balance_loss_clip": 1.0477072, + "balance_loss_mlp": 1.01576138, + "epoch": 0.7328804184452594, + "flos": 16289224462080.0, + "grad_norm": 2.550591191607863, + "language_loss": 0.76353878, + "learning_rate": 7.027928570874794e-07, + "loss": 0.78530037, + "num_input_tokens_seen": 130977060, + "step": 6095, + "time_per_iteration": 2.411625623703003 + }, + { + "auxiliary_loss_clip": 0.01165811, + "auxiliary_loss_mlp": 0.01025871, + "balance_loss_clip": 1.04815173, + "balance_loss_mlp": 1.0187484, + "epoch": 0.7330006613358985, + "flos": 17858233422720.0, + "grad_norm": 1.8203880795264558, + "language_loss": 0.85420823, + "learning_rate": 7.022000596042194e-07, + "loss": 0.8761251, + "num_input_tokens_seen": 130994160, + "step": 6096, + "time_per_iteration": 2.4382669925689697 + }, + { + "auxiliary_loss_clip": 0.01126081, + "auxiliary_loss_mlp": 0.0102531, + "balance_loss_clip": 1.04124975, + "balance_loss_mlp": 1.0181396, + "epoch": 0.7331209042265376, + "flos": 22492074343680.0, + "grad_norm": 2.0782956590857826, + "language_loss": 0.82074994, + "learning_rate": 7.016074589961784e-07, + "loss": 0.84226382, + "num_input_tokens_seen": 131012725, + "step": 6097, + "time_per_iteration": 4.066557884216309 + }, + { + "auxiliary_loss_clip": 0.01137424, + "auxiliary_loss_mlp": 0.01024848, + "balance_loss_clip": 1.0472331, + "balance_loss_mlp": 1.0173645, + "epoch": 0.7332411471171767, + "flos": 33072757937280.0, + "grad_norm": 3.373020572821285, + "language_loss": 0.66768903, + "learning_rate": 7.01015055353253e-07, + "loss": 0.68931174, + "num_input_tokens_seen": 131035150, + "step": 6098, + "time_per_iteration": 3.427626132965088 + }, + { + "auxiliary_loss_clip": 0.01104349, + "auxiliary_loss_mlp": 0.01025622, + "balance_loss_clip": 1.04264235, + "balance_loss_mlp": 1.01781642, + "epoch": 0.7333613900078157, + "flos": 22743017735040.0, + "grad_norm": 1.6594426146958103, + "language_loss": 0.77940929, + "learning_rate": 7.004228487653123e-07, + "loss": 0.80070901, + "num_input_tokens_seen": 131055955, + "step": 6099, + "time_per_iteration": 2.600335121154785 + }, + { + "auxiliary_loss_clip": 0.01119688, + "auxiliary_loss_mlp": 0.01022542, + "balance_loss_clip": 1.03903115, + "balance_loss_mlp": 1.0155654, + "epoch": 0.7334816328984549, + "flos": 22346133384960.0, + "grad_norm": 2.3250237476893187, + "language_loss": 0.78377461, + "learning_rate": 6.998308393221906e-07, + "loss": 0.80519694, + "num_input_tokens_seen": 131074360, + "step": 6100, + "time_per_iteration": 2.4972546100616455 + }, + { + "auxiliary_loss_clip": 0.01126922, + "auxiliary_loss_mlp": 0.01025261, + "balance_loss_clip": 1.04453993, + "balance_loss_mlp": 1.01818848, + "epoch": 0.733601875789094, + "flos": 20736149984640.0, + "grad_norm": 2.0025848768325925, + "language_loss": 0.71113706, + "learning_rate": 6.992390271136977e-07, + "loss": 0.73265886, + "num_input_tokens_seen": 131090070, + "step": 6101, + "time_per_iteration": 2.498704433441162 + }, + { + "auxiliary_loss_clip": 0.01146787, + "auxiliary_loss_mlp": 0.01020907, + "balance_loss_clip": 1.04429829, + "balance_loss_mlp": 1.01414514, + "epoch": 0.733722118679733, + "flos": 22564362464640.0, + "grad_norm": 1.8165193175406134, + "language_loss": 0.85598075, + "learning_rate": 6.986474122296094e-07, + "loss": 0.87765771, + "num_input_tokens_seen": 131109185, + "step": 6102, + "time_per_iteration": 3.1823179721832275 + }, + { + "auxiliary_loss_clip": 0.01172218, + "auxiliary_loss_mlp": 0.01020854, + "balance_loss_clip": 1.05054927, + "balance_loss_mlp": 1.01384151, + "epoch": 0.7338423615703722, + "flos": 20084192179200.0, + "grad_norm": 3.075498197473535, + "language_loss": 0.72113538, + "learning_rate": 6.980559947596751e-07, + "loss": 0.74306607, + "num_input_tokens_seen": 131127725, + "step": 6103, + "time_per_iteration": 2.397486686706543 + }, + { + "auxiliary_loss_clip": 0.01108667, + "auxiliary_loss_mlp": 0.01022405, + "balance_loss_clip": 1.04046476, + "balance_loss_mlp": 1.01496959, + "epoch": 0.7339626044610112, + "flos": 21687675217920.0, + "grad_norm": 2.1976276906123364, + "language_loss": 0.75900066, + "learning_rate": 6.974647747936109e-07, + "loss": 0.78031135, + "num_input_tokens_seen": 131146110, + "step": 6104, + "time_per_iteration": 2.5461323261260986 + }, + { + "auxiliary_loss_clip": 0.01169647, + "auxiliary_loss_mlp": 0.00761873, + "balance_loss_clip": 1.05007541, + "balance_loss_mlp": 1.00040174, + "epoch": 0.7340828473516503, + "flos": 15268248282240.0, + "grad_norm": 2.5093309413384755, + "language_loss": 0.82421112, + "learning_rate": 6.968737524211039e-07, + "loss": 0.84352636, + "num_input_tokens_seen": 131162920, + "step": 6105, + "time_per_iteration": 2.3735501766204834 + }, + { + "auxiliary_loss_clip": 0.01153453, + "auxiliary_loss_mlp": 0.01024431, + "balance_loss_clip": 1.04783189, + "balance_loss_mlp": 1.01715636, + "epoch": 0.7342030902422895, + "flos": 22930112701440.0, + "grad_norm": 1.9187462857665418, + "language_loss": 0.80317259, + "learning_rate": 6.962829277318132e-07, + "loss": 0.82495141, + "num_input_tokens_seen": 131182515, + "step": 6106, + "time_per_iteration": 2.444582223892212 + }, + { + "auxiliary_loss_clip": 0.01156594, + "auxiliary_loss_mlp": 0.01025203, + "balance_loss_clip": 1.05060506, + "balance_loss_mlp": 1.01857531, + "epoch": 0.7343233331329285, + "flos": 25847890381440.0, + "grad_norm": 1.7944830951726432, + "language_loss": 0.83404624, + "learning_rate": 6.956923008153652e-07, + "loss": 0.85586423, + "num_input_tokens_seen": 131202280, + "step": 6107, + "time_per_iteration": 2.4748213291168213 + }, + { + "auxiliary_loss_clip": 0.01153147, + "auxiliary_loss_mlp": 0.0102744, + "balance_loss_clip": 1.04541135, + "balance_loss_mlp": 1.02101445, + "epoch": 0.7344435760235676, + "flos": 18478985287680.0, + "grad_norm": 2.1985630007422965, + "language_loss": 0.84163302, + "learning_rate": 6.951018717613593e-07, + "loss": 0.86343884, + "num_input_tokens_seen": 131221295, + "step": 6108, + "time_per_iteration": 2.452054738998413 + }, + { + "auxiliary_loss_clip": 0.01152709, + "auxiliary_loss_mlp": 0.01024608, + "balance_loss_clip": 1.0479449, + "balance_loss_mlp": 1.01766443, + "epoch": 0.7345638189142067, + "flos": 17640040256640.0, + "grad_norm": 2.111463295746286, + "language_loss": 0.7824921, + "learning_rate": 6.945116406593614e-07, + "loss": 0.80426526, + "num_input_tokens_seen": 131240150, + "step": 6109, + "time_per_iteration": 2.4198927879333496 + }, + { + "auxiliary_loss_clip": 0.01114618, + "auxiliary_loss_mlp": 0.01025403, + "balance_loss_clip": 1.04510975, + "balance_loss_mlp": 1.01817298, + "epoch": 0.7346840618048458, + "flos": 20260225756800.0, + "grad_norm": 2.0335258083724725, + "language_loss": 0.74789572, + "learning_rate": 6.939216075989089e-07, + "loss": 0.76929587, + "num_input_tokens_seen": 131258080, + "step": 6110, + "time_per_iteration": 2.5281388759613037 + }, + { + "auxiliary_loss_clip": 0.01136416, + "auxiliary_loss_mlp": 0.0102096, + "balance_loss_clip": 1.04391694, + "balance_loss_mlp": 1.01373553, + "epoch": 0.7348043046954849, + "flos": 29023183641600.0, + "grad_norm": 2.0789368476138144, + "language_loss": 0.66281021, + "learning_rate": 6.933317726695109e-07, + "loss": 0.68438393, + "num_input_tokens_seen": 131279310, + "step": 6111, + "time_per_iteration": 2.523442506790161 + }, + { + "auxiliary_loss_clip": 0.0112566, + "auxiliary_loss_mlp": 0.01025986, + "balance_loss_clip": 1.04791582, + "balance_loss_mlp": 1.01862824, + "epoch": 0.734924547586124, + "flos": 17931203902080.0, + "grad_norm": 2.316041614826212, + "language_loss": 0.79884422, + "learning_rate": 6.92742135960644e-07, + "loss": 0.82036066, + "num_input_tokens_seen": 131297010, + "step": 6112, + "time_per_iteration": 2.471081495285034 + }, + { + "auxiliary_loss_clip": 0.01061137, + "auxiliary_loss_mlp": 0.01001304, + "balance_loss_clip": 1.01565647, + "balance_loss_mlp": 1.00023687, + "epoch": 0.7350447904767631, + "flos": 63588319850880.0, + "grad_norm": 0.8204316855328927, + "language_loss": 0.55722958, + "learning_rate": 6.921526975617556e-07, + "loss": 0.57785398, + "num_input_tokens_seen": 131356470, + "step": 6113, + "time_per_iteration": 3.0800039768218994 + }, + { + "auxiliary_loss_clip": 0.01141019, + "auxiliary_loss_mlp": 0.01024701, + "balance_loss_clip": 1.04361629, + "balance_loss_mlp": 1.01739979, + "epoch": 0.7351650333674021, + "flos": 21580015178880.0, + "grad_norm": 2.0189428341341573, + "language_loss": 0.75163162, + "learning_rate": 6.915634575622631e-07, + "loss": 0.77328885, + "num_input_tokens_seen": 131374985, + "step": 6114, + "time_per_iteration": 2.4784367084503174 + }, + { + "auxiliary_loss_clip": 0.01166651, + "auxiliary_loss_mlp": 0.01022242, + "balance_loss_clip": 1.04894483, + "balance_loss_mlp": 1.01523268, + "epoch": 0.7352852762580413, + "flos": 18186349184640.0, + "grad_norm": 1.8347008491799346, + "language_loss": 0.7092185, + "learning_rate": 6.909744160515532e-07, + "loss": 0.73110735, + "num_input_tokens_seen": 131393125, + "step": 6115, + "time_per_iteration": 2.388014316558838 + }, + { + "auxiliary_loss_clip": 0.01137416, + "auxiliary_loss_mlp": 0.01024828, + "balance_loss_clip": 1.0452559, + "balance_loss_mlp": 1.01779723, + "epoch": 0.7354055191486804, + "flos": 38910073063680.0, + "grad_norm": 1.995171613961482, + "language_loss": 0.69654024, + "learning_rate": 6.903855731189849e-07, + "loss": 0.71816266, + "num_input_tokens_seen": 131415760, + "step": 6116, + "time_per_iteration": 2.6465911865234375 + }, + { + "auxiliary_loss_clip": 0.01147112, + "auxiliary_loss_mlp": 0.01030061, + "balance_loss_clip": 1.04764831, + "balance_loss_mlp": 1.02231574, + "epoch": 0.7355257620393194, + "flos": 16289978647680.0, + "grad_norm": 2.388241065246289, + "language_loss": 0.82299089, + "learning_rate": 6.897969288538825e-07, + "loss": 0.84476262, + "num_input_tokens_seen": 131433705, + "step": 6117, + "time_per_iteration": 2.4507827758789062 + }, + { + "auxiliary_loss_clip": 0.01138816, + "auxiliary_loss_mlp": 0.01024592, + "balance_loss_clip": 1.0463208, + "balance_loss_mlp": 1.01778245, + "epoch": 0.7356460049299585, + "flos": 18114240631680.0, + "grad_norm": 2.3434490001092554, + "language_loss": 0.81514186, + "learning_rate": 6.892084833455452e-07, + "loss": 0.8367759, + "num_input_tokens_seen": 131453275, + "step": 6118, + "time_per_iteration": 2.453321695327759 + }, + { + "auxiliary_loss_clip": 0.0115268, + "auxiliary_loss_mlp": 0.01022325, + "balance_loss_clip": 1.04944384, + "balance_loss_mlp": 1.01560163, + "epoch": 0.7357662478205976, + "flos": 21325193118720.0, + "grad_norm": 1.4434957043541996, + "language_loss": 0.83912963, + "learning_rate": 6.886202366832384e-07, + "loss": 0.86087966, + "num_input_tokens_seen": 131474960, + "step": 6119, + "time_per_iteration": 2.463392972946167 + }, + { + "auxiliary_loss_clip": 0.01113138, + "auxiliary_loss_mlp": 0.01023785, + "balance_loss_clip": 1.04653382, + "balance_loss_mlp": 1.01610804, + "epoch": 0.7358864907112367, + "flos": 14246841139200.0, + "grad_norm": 9.36868885106388, + "language_loss": 0.73476374, + "learning_rate": 6.880321889561987e-07, + "loss": 0.75613296, + "num_input_tokens_seen": 131492935, + "step": 6120, + "time_per_iteration": 2.504666805267334 + }, + { + "auxiliary_loss_clip": 0.01119984, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.04332554, + "balance_loss_mlp": 1.01975584, + "epoch": 0.7360067336018757, + "flos": 22309684058880.0, + "grad_norm": 2.763689757625933, + "language_loss": 0.65157712, + "learning_rate": 6.874443402536338e-07, + "loss": 0.67305475, + "num_input_tokens_seen": 131512025, + "step": 6121, + "time_per_iteration": 2.507908821105957 + }, + { + "auxiliary_loss_clip": 0.01142145, + "auxiliary_loss_mlp": 0.01020396, + "balance_loss_clip": 1.04724443, + "balance_loss_mlp": 1.01326132, + "epoch": 0.7361269764925149, + "flos": 25554607833600.0, + "grad_norm": 2.8295515221300467, + "language_loss": 0.80553359, + "learning_rate": 6.868566906647177e-07, + "loss": 0.82715905, + "num_input_tokens_seen": 131532975, + "step": 6122, + "time_per_iteration": 2.5304811000823975 + }, + { + "auxiliary_loss_clip": 0.01153464, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.04568267, + "balance_loss_mlp": 1.02060509, + "epoch": 0.736247219383154, + "flos": 20376505059840.0, + "grad_norm": 2.8301423462248514, + "language_loss": 0.83408904, + "learning_rate": 6.862692402785984e-07, + "loss": 0.85590565, + "num_input_tokens_seen": 131553225, + "step": 6123, + "time_per_iteration": 3.161473274230957 + }, + { + "auxiliary_loss_clip": 0.01033688, + "auxiliary_loss_mlp": 0.01004321, + "balance_loss_clip": 1.01557016, + "balance_loss_mlp": 1.00307488, + "epoch": 0.736367462273793, + "flos": 70339525735680.0, + "grad_norm": 0.7948722943119283, + "language_loss": 0.4958145, + "learning_rate": 6.856819891843899e-07, + "loss": 0.51619458, + "num_input_tokens_seen": 131617930, + "step": 6124, + "time_per_iteration": 4.001164436340332 + }, + { + "auxiliary_loss_clip": 0.01096068, + "auxiliary_loss_mlp": 0.01027882, + "balance_loss_clip": 1.04080462, + "balance_loss_mlp": 1.02078855, + "epoch": 0.7364877051644322, + "flos": 22412711243520.0, + "grad_norm": 1.977683330044527, + "language_loss": 0.72185355, + "learning_rate": 6.8509493747118e-07, + "loss": 0.74309307, + "num_input_tokens_seen": 131636740, + "step": 6125, + "time_per_iteration": 3.3232955932617188 + }, + { + "auxiliary_loss_clip": 0.01167659, + "auxiliary_loss_mlp": 0.01024772, + "balance_loss_clip": 1.04850829, + "balance_loss_mlp": 1.01767015, + "epoch": 0.7366079480550712, + "flos": 12130266274560.0, + "grad_norm": 4.056533982848506, + "language_loss": 0.87854242, + "learning_rate": 6.845080852280221e-07, + "loss": 0.90046674, + "num_input_tokens_seen": 131653810, + "step": 6126, + "time_per_iteration": 2.367185115814209 + }, + { + "auxiliary_loss_clip": 0.0112573, + "auxiliary_loss_mlp": 0.01021132, + "balance_loss_clip": 1.0436331, + "balance_loss_mlp": 1.01472116, + "epoch": 0.7367281909457103, + "flos": 15049336844160.0, + "grad_norm": 1.608921876373448, + "language_loss": 0.742347, + "learning_rate": 6.839214325439409e-07, + "loss": 0.76381558, + "num_input_tokens_seen": 131671505, + "step": 6127, + "time_per_iteration": 2.48690128326416 + }, + { + "auxiliary_loss_clip": 0.01133006, + "auxiliary_loss_mlp": 0.01024216, + "balance_loss_clip": 1.04617584, + "balance_loss_mlp": 1.0170753, + "epoch": 0.7368484338363495, + "flos": 23510752053120.0, + "grad_norm": 2.39098191918476, + "language_loss": 0.7174834, + "learning_rate": 6.833349795079327e-07, + "loss": 0.73905563, + "num_input_tokens_seen": 131690615, + "step": 6128, + "time_per_iteration": 3.2437474727630615 + }, + { + "auxiliary_loss_clip": 0.01125568, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.04596829, + "balance_loss_mlp": 1.01815772, + "epoch": 0.7369686767269885, + "flos": 27417833095680.0, + "grad_norm": 1.66657925085952, + "language_loss": 0.68344021, + "learning_rate": 6.827487262089613e-07, + "loss": 0.70494872, + "num_input_tokens_seen": 131711120, + "step": 6129, + "time_per_iteration": 2.5470545291900635 + }, + { + "auxiliary_loss_clip": 0.01040807, + "auxiliary_loss_mlp": 0.01001375, + "balance_loss_clip": 1.01108766, + "balance_loss_mlp": 1.00024879, + "epoch": 0.7370889196176276, + "flos": 70293343824000.0, + "grad_norm": 0.9011213052345667, + "language_loss": 0.56821793, + "learning_rate": 6.821626727359606e-07, + "loss": 0.58863974, + "num_input_tokens_seen": 131776680, + "step": 6130, + "time_per_iteration": 3.124809741973877 + }, + { + "auxiliary_loss_clip": 0.01143709, + "auxiliary_loss_mlp": 0.01023798, + "balance_loss_clip": 1.05106902, + "balance_loss_mlp": 1.01607335, + "epoch": 0.7372091625082667, + "flos": 18040839189120.0, + "grad_norm": 2.9872590313058676, + "language_loss": 0.76932818, + "learning_rate": 6.815768191778348e-07, + "loss": 0.79100323, + "num_input_tokens_seen": 131794760, + "step": 6131, + "time_per_iteration": 2.4574620723724365 + }, + { + "auxiliary_loss_clip": 0.01150286, + "auxiliary_loss_mlp": 0.01026746, + "balance_loss_clip": 1.04574406, + "balance_loss_mlp": 1.01939034, + "epoch": 0.7373294053989058, + "flos": 33726331854720.0, + "grad_norm": 2.072196462936505, + "language_loss": 0.7303524, + "learning_rate": 6.809911656234569e-07, + "loss": 0.7521227, + "num_input_tokens_seen": 131816735, + "step": 6132, + "time_per_iteration": 2.5581698417663574 + }, + { + "auxiliary_loss_clip": 0.01126694, + "auxiliary_loss_mlp": 0.01021401, + "balance_loss_clip": 1.04247594, + "balance_loss_mlp": 1.01460648, + "epoch": 0.7374496482895448, + "flos": 21506326427520.0, + "grad_norm": 2.0023466777862127, + "language_loss": 0.78540516, + "learning_rate": 6.804057121616707e-07, + "loss": 0.80688608, + "num_input_tokens_seen": 131834940, + "step": 6133, + "time_per_iteration": 2.5201940536499023 + }, + { + "auxiliary_loss_clip": 0.01154842, + "auxiliary_loss_mlp": 0.01024463, + "balance_loss_clip": 1.04774356, + "balance_loss_mlp": 1.0167737, + "epoch": 0.737569891180184, + "flos": 24936908624640.0, + "grad_norm": 2.035292912528272, + "language_loss": 0.72040772, + "learning_rate": 6.798204588812888e-07, + "loss": 0.74220073, + "num_input_tokens_seen": 131854355, + "step": 6134, + "time_per_iteration": 2.508913040161133 + }, + { + "auxiliary_loss_clip": 0.01084932, + "auxiliary_loss_mlp": 0.00761731, + "balance_loss_clip": 1.03742397, + "balance_loss_mlp": 1.00034809, + "epoch": 0.7376901340708231, + "flos": 20664544222080.0, + "grad_norm": 1.793526734011821, + "language_loss": 0.75624692, + "learning_rate": 6.792354058710937e-07, + "loss": 0.77471364, + "num_input_tokens_seen": 131871825, + "step": 6135, + "time_per_iteration": 2.5681512355804443 + }, + { + "auxiliary_loss_clip": 0.01161144, + "auxiliary_loss_mlp": 0.0102108, + "balance_loss_clip": 1.04705095, + "balance_loss_mlp": 1.01413596, + "epoch": 0.7378103769614621, + "flos": 23805794367360.0, + "grad_norm": 1.7824262238039135, + "language_loss": 0.65195465, + "learning_rate": 6.786505532198374e-07, + "loss": 0.67377687, + "num_input_tokens_seen": 131890770, + "step": 6136, + "time_per_iteration": 2.4442238807678223 + }, + { + "auxiliary_loss_clip": 0.01167465, + "auxiliary_loss_mlp": 0.01026631, + "balance_loss_clip": 1.04805684, + "balance_loss_mlp": 1.01902533, + "epoch": 0.7379306198521013, + "flos": 22237216369920.0, + "grad_norm": 1.6295217909728643, + "language_loss": 0.8544395, + "learning_rate": 6.780659010162411e-07, + "loss": 0.87638044, + "num_input_tokens_seen": 131909720, + "step": 6137, + "time_per_iteration": 2.4078242778778076 + }, + { + "auxiliary_loss_clip": 0.01130993, + "auxiliary_loss_mlp": 0.01023, + "balance_loss_clip": 1.04657292, + "balance_loss_mlp": 1.01618147, + "epoch": 0.7380508627427403, + "flos": 14903108576640.0, + "grad_norm": 1.7315278849538593, + "language_loss": 0.83306575, + "learning_rate": 6.774814493489975e-07, + "loss": 0.85460567, + "num_input_tokens_seen": 131927395, + "step": 6138, + "time_per_iteration": 2.4646408557891846 + }, + { + "auxiliary_loss_clip": 0.01150791, + "auxiliary_loss_mlp": 0.01023401, + "balance_loss_clip": 1.04736924, + "balance_loss_mlp": 1.01627243, + "epoch": 0.7381711056333794, + "flos": 21685843624320.0, + "grad_norm": 1.8481432317643938, + "language_loss": 0.66169411, + "learning_rate": 6.768971983067655e-07, + "loss": 0.68343604, + "num_input_tokens_seen": 131947725, + "step": 6139, + "time_per_iteration": 2.444254159927368 + }, + { + "auxiliary_loss_clip": 0.01068921, + "auxiliary_loss_mlp": 0.01000978, + "balance_loss_clip": 1.01360869, + "balance_loss_mlp": 0.99999475, + "epoch": 0.7382913485240186, + "flos": 52404263596800.0, + "grad_norm": 1.008046804286874, + "language_loss": 0.67838311, + "learning_rate": 6.763131479781772e-07, + "loss": 0.69908208, + "num_input_tokens_seen": 131997485, + "step": 6140, + "time_per_iteration": 2.824784517288208 + }, + { + "auxiliary_loss_clip": 0.01132157, + "auxiliary_loss_mlp": 0.01023985, + "balance_loss_clip": 1.04671896, + "balance_loss_mlp": 1.01700807, + "epoch": 0.7384115914146576, + "flos": 21798818876160.0, + "grad_norm": 2.146481648944633, + "language_loss": 0.76783538, + "learning_rate": 6.757292984518316e-07, + "loss": 0.78939682, + "num_input_tokens_seen": 132016885, + "step": 6141, + "time_per_iteration": 2.456026792526245 + }, + { + "auxiliary_loss_clip": 0.01059514, + "auxiliary_loss_mlp": 0.01001796, + "balance_loss_clip": 1.01408243, + "balance_loss_mlp": 1.00078857, + "epoch": 0.7385318343052967, + "flos": 61494331662720.0, + "grad_norm": 0.7408605739698927, + "language_loss": 0.56436586, + "learning_rate": 6.751456498162981e-07, + "loss": 0.58497894, + "num_input_tokens_seen": 132075920, + "step": 6142, + "time_per_iteration": 2.9201762676239014 + }, + { + "auxiliary_loss_clip": 0.01150295, + "auxiliary_loss_mlp": 0.01021738, + "balance_loss_clip": 1.04317379, + "balance_loss_mlp": 1.01553011, + "epoch": 0.7386520771959358, + "flos": 17013757697280.0, + "grad_norm": 2.040981969223243, + "language_loss": 0.85481173, + "learning_rate": 6.745622021601174e-07, + "loss": 0.87653208, + "num_input_tokens_seen": 132092945, + "step": 6143, + "time_per_iteration": 2.398787498474121 + }, + { + "auxiliary_loss_clip": 0.01127585, + "auxiliary_loss_mlp": 0.01022816, + "balance_loss_clip": 1.0437746, + "balance_loss_mlp": 1.01579201, + "epoch": 0.7387723200865749, + "flos": 18770759464320.0, + "grad_norm": 1.9343672579084095, + "language_loss": 0.69383079, + "learning_rate": 6.739789555717954e-07, + "loss": 0.71533477, + "num_input_tokens_seen": 132109920, + "step": 6144, + "time_per_iteration": 2.471367597579956 + }, + { + "auxiliary_loss_clip": 0.01166018, + "auxiliary_loss_mlp": 0.01024849, + "balance_loss_clip": 1.04785812, + "balance_loss_mlp": 1.01795232, + "epoch": 0.738892562977214, + "flos": 22525542840960.0, + "grad_norm": 2.299903502533772, + "language_loss": 0.77365208, + "learning_rate": 6.733959101398124e-07, + "loss": 0.79556072, + "num_input_tokens_seen": 132128050, + "step": 6145, + "time_per_iteration": 2.4074547290802 + }, + { + "auxiliary_loss_clip": 0.01137132, + "auxiliary_loss_mlp": 0.01025278, + "balance_loss_clip": 1.04421115, + "balance_loss_mlp": 1.01754189, + "epoch": 0.7390128058678531, + "flos": 21501478091520.0, + "grad_norm": 1.6021813279585075, + "language_loss": 0.81531954, + "learning_rate": 6.728130659526143e-07, + "loss": 0.83694357, + "num_input_tokens_seen": 132145860, + "step": 6146, + "time_per_iteration": 2.473816156387329 + }, + { + "auxiliary_loss_clip": 0.01142773, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.04756582, + "balance_loss_mlp": 1.02216864, + "epoch": 0.7391330487584922, + "flos": 25776176878080.0, + "grad_norm": 2.4484079217432675, + "language_loss": 0.71100426, + "learning_rate": 6.7223042309862e-07, + "loss": 0.73272794, + "num_input_tokens_seen": 132166060, + "step": 6147, + "time_per_iteration": 2.5129122734069824 + }, + { + "auxiliary_loss_clip": 0.01149319, + "auxiliary_loss_mlp": 0.0102609, + "balance_loss_clip": 1.04482603, + "balance_loss_mlp": 1.01949739, + "epoch": 0.7392532916491312, + "flos": 28366736636160.0, + "grad_norm": 4.769843803939903, + "language_loss": 0.73553824, + "learning_rate": 6.716479816662144e-07, + "loss": 0.75729239, + "num_input_tokens_seen": 132187790, + "step": 6148, + "time_per_iteration": 2.5065290927886963 + }, + { + "auxiliary_loss_clip": 0.01142973, + "auxiliary_loss_mlp": 0.01022549, + "balance_loss_clip": 1.04529178, + "balance_loss_mlp": 1.01551604, + "epoch": 0.7393735345397703, + "flos": 23585877348480.0, + "grad_norm": 1.9293338262648343, + "language_loss": 0.73069763, + "learning_rate": 6.710657417437531e-07, + "loss": 0.75235289, + "num_input_tokens_seen": 132207495, + "step": 6149, + "time_per_iteration": 2.544508934020996 + }, + { + "auxiliary_loss_clip": 0.01137384, + "auxiliary_loss_mlp": 0.01024611, + "balance_loss_clip": 1.04498255, + "balance_loss_mlp": 1.0176549, + "epoch": 0.7394937774304094, + "flos": 19974772373760.0, + "grad_norm": 2.4202571793140564, + "language_loss": 0.79693204, + "learning_rate": 6.704837034195628e-07, + "loss": 0.81855202, + "num_input_tokens_seen": 132225960, + "step": 6150, + "time_per_iteration": 4.103356122970581 + }, + { + "auxiliary_loss_clip": 0.01144622, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.04429317, + "balance_loss_mlp": 1.02390468, + "epoch": 0.7396140203210485, + "flos": 23478037741440.0, + "grad_norm": 1.6533003076769768, + "language_loss": 0.84895766, + "learning_rate": 6.699018667819376e-07, + "loss": 0.87071669, + "num_input_tokens_seen": 132245360, + "step": 6151, + "time_per_iteration": 2.4704983234405518 + }, + { + "auxiliary_loss_clip": 0.01150598, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.04499865, + "balance_loss_mlp": 1.02094853, + "epoch": 0.7397342632116876, + "flos": 25555433846400.0, + "grad_norm": 1.573566404652876, + "language_loss": 0.72694886, + "learning_rate": 6.693202319191415e-07, + "loss": 0.74874073, + "num_input_tokens_seen": 132267095, + "step": 6152, + "time_per_iteration": 3.3766157627105713 + }, + { + "auxiliary_loss_clip": 0.0116696, + "auxiliary_loss_mlp": 0.01026223, + "balance_loss_clip": 1.05151534, + "balance_loss_mlp": 1.01887691, + "epoch": 0.7398545061023267, + "flos": 24755021130240.0, + "grad_norm": 1.8394961097270544, + "language_loss": 0.7519151, + "learning_rate": 6.687387989194084e-07, + "loss": 0.77384698, + "num_input_tokens_seen": 132286610, + "step": 6153, + "time_per_iteration": 2.5108325481414795 + }, + { + "auxiliary_loss_clip": 0.01136614, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.04856825, + "balance_loss_mlp": 1.01886511, + "epoch": 0.7399747489929658, + "flos": 16508602776960.0, + "grad_norm": 1.7675194688513232, + "language_loss": 0.79115498, + "learning_rate": 6.681575678709404e-07, + "loss": 0.8127811, + "num_input_tokens_seen": 132305300, + "step": 6154, + "time_per_iteration": 2.4753262996673584 + }, + { + "auxiliary_loss_clip": 0.01152331, + "auxiliary_loss_mlp": 0.01022402, + "balance_loss_clip": 1.04705024, + "balance_loss_mlp": 1.01550889, + "epoch": 0.7400949918836048, + "flos": 24097065753600.0, + "grad_norm": 1.9513790591154117, + "language_loss": 0.70933914, + "learning_rate": 6.67576538861911e-07, + "loss": 0.73108649, + "num_input_tokens_seen": 132323875, + "step": 6155, + "time_per_iteration": 3.265145778656006 + }, + { + "auxiliary_loss_clip": 0.01134586, + "auxiliary_loss_mlp": 0.01022995, + "balance_loss_clip": 1.0445075, + "balance_loss_mlp": 1.01632559, + "epoch": 0.740215234774244, + "flos": 21802517976960.0, + "grad_norm": 1.914649125924512, + "language_loss": 0.82184827, + "learning_rate": 6.669957119804612e-07, + "loss": 0.84342408, + "num_input_tokens_seen": 132345510, + "step": 6156, + "time_per_iteration": 2.5210795402526855 + }, + { + "auxiliary_loss_clip": 0.01147576, + "auxiliary_loss_mlp": 0.01025451, + "balance_loss_clip": 1.04784179, + "balance_loss_mlp": 1.01861393, + "epoch": 0.7403354776648831, + "flos": 18733196816640.0, + "grad_norm": 3.274396289426646, + "language_loss": 0.71954364, + "learning_rate": 6.66415087314702e-07, + "loss": 0.74127388, + "num_input_tokens_seen": 132360465, + "step": 6157, + "time_per_iteration": 2.4516172409057617 + }, + { + "auxiliary_loss_clip": 0.01139076, + "auxiliary_loss_mlp": 0.01019088, + "balance_loss_clip": 1.04441166, + "balance_loss_mlp": 1.01216197, + "epoch": 0.7404557205555221, + "flos": 16909581277440.0, + "grad_norm": 1.9876777460500796, + "language_loss": 0.73317605, + "learning_rate": 6.65834664952714e-07, + "loss": 0.75475764, + "num_input_tokens_seen": 132377915, + "step": 6158, + "time_per_iteration": 2.4498064517974854 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01022515, + "balance_loss_clip": 1.0433557, + "balance_loss_mlp": 1.01587176, + "epoch": 0.7405759634461613, + "flos": 21214408596480.0, + "grad_norm": 5.215755168968883, + "language_loss": 0.76198125, + "learning_rate": 6.652544449825457e-07, + "loss": 0.78345418, + "num_input_tokens_seen": 132398170, + "step": 6159, + "time_per_iteration": 2.5090584754943848 + }, + { + "auxiliary_loss_clip": 0.01146281, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.04439187, + "balance_loss_mlp": 1.02203774, + "epoch": 0.7406962063368003, + "flos": 20480106862080.0, + "grad_norm": 2.281255998077098, + "language_loss": 0.76381552, + "learning_rate": 6.646744274922182e-07, + "loss": 0.7855714, + "num_input_tokens_seen": 132416615, + "step": 6160, + "time_per_iteration": 2.4607415199279785 + }, + { + "auxiliary_loss_clip": 0.01141031, + "auxiliary_loss_mlp": 0.01019453, + "balance_loss_clip": 1.0450536, + "balance_loss_mlp": 1.0123508, + "epoch": 0.7408164492274394, + "flos": 19791915212160.0, + "grad_norm": 2.6430930941410273, + "language_loss": 0.75554162, + "learning_rate": 6.640946125697171e-07, + "loss": 0.77714646, + "num_input_tokens_seen": 132434145, + "step": 6161, + "time_per_iteration": 2.4523236751556396 + }, + { + "auxiliary_loss_clip": 0.01153634, + "auxiliary_loss_mlp": 0.01021035, + "balance_loss_clip": 1.04576206, + "balance_loss_mlp": 1.01379323, + "epoch": 0.7409366921180786, + "flos": 29204855654400.0, + "grad_norm": 1.7400778372577494, + "language_loss": 0.75125176, + "learning_rate": 6.635150003030017e-07, + "loss": 0.77299845, + "num_input_tokens_seen": 132452670, + "step": 6162, + "time_per_iteration": 2.5072247982025146 + }, + { + "auxiliary_loss_clip": 0.01109275, + "auxiliary_loss_mlp": 0.01022135, + "balance_loss_clip": 1.03884137, + "balance_loss_mlp": 1.0152154, + "epoch": 0.7410569350087176, + "flos": 22930004960640.0, + "grad_norm": 2.5277331856093337, + "language_loss": 0.85946649, + "learning_rate": 6.629355907799981e-07, + "loss": 0.88078064, + "num_input_tokens_seen": 132472475, + "step": 6163, + "time_per_iteration": 2.524101495742798 + }, + { + "auxiliary_loss_clip": 0.01152612, + "auxiliary_loss_mlp": 0.01022343, + "balance_loss_clip": 1.04496896, + "balance_loss_mlp": 1.01539588, + "epoch": 0.7411771778993567, + "flos": 30440397726720.0, + "grad_norm": 1.745145059194402, + "language_loss": 0.69244128, + "learning_rate": 6.623563840886015e-07, + "loss": 0.71419084, + "num_input_tokens_seen": 132493400, + "step": 6164, + "time_per_iteration": 2.5163278579711914 + }, + { + "auxiliary_loss_clip": 0.01148683, + "auxiliary_loss_mlp": 0.0102045, + "balance_loss_clip": 1.04535162, + "balance_loss_mlp": 1.01350021, + "epoch": 0.7412974207899958, + "flos": 20522050968960.0, + "grad_norm": 1.5921267757564497, + "language_loss": 0.69588256, + "learning_rate": 6.617773803166795e-07, + "loss": 0.71757388, + "num_input_tokens_seen": 132511725, + "step": 6165, + "time_per_iteration": 2.429363489151001 + }, + { + "auxiliary_loss_clip": 0.01144432, + "auxiliary_loss_mlp": 0.00762124, + "balance_loss_clip": 1.04588127, + "balance_loss_mlp": 1.00039768, + "epoch": 0.7414176636806349, + "flos": 22090700793600.0, + "grad_norm": 2.160269003135224, + "language_loss": 0.82313693, + "learning_rate": 6.611985795520634e-07, + "loss": 0.84220243, + "num_input_tokens_seen": 132530270, + "step": 6166, + "time_per_iteration": 2.475053071975708 + }, + { + "auxiliary_loss_clip": 0.01134589, + "auxiliary_loss_mlp": 0.01025548, + "balance_loss_clip": 1.04705679, + "balance_loss_mlp": 1.01823461, + "epoch": 0.7415379065712739, + "flos": 25155245445120.0, + "grad_norm": 1.9970311768991247, + "language_loss": 0.77234519, + "learning_rate": 6.606199818825588e-07, + "loss": 0.79394656, + "num_input_tokens_seen": 132550725, + "step": 6167, + "time_per_iteration": 2.542524814605713 + }, + { + "auxiliary_loss_clip": 0.01143904, + "auxiliary_loss_mlp": 0.01023678, + "balance_loss_clip": 1.04378426, + "balance_loss_mlp": 1.01652575, + "epoch": 0.7416581494619131, + "flos": 16871731320960.0, + "grad_norm": 3.0072285731994546, + "language_loss": 0.81650358, + "learning_rate": 6.600415873959377e-07, + "loss": 0.83817947, + "num_input_tokens_seen": 132568600, + "step": 6168, + "time_per_iteration": 2.438075065612793 + }, + { + "auxiliary_loss_clip": 0.01095927, + "auxiliary_loss_mlp": 0.00761202, + "balance_loss_clip": 1.04002655, + "balance_loss_mlp": 1.00032854, + "epoch": 0.7417783923525522, + "flos": 28438881102720.0, + "grad_norm": 1.9182251782080653, + "language_loss": 0.64667565, + "learning_rate": 6.594633961799437e-07, + "loss": 0.6652469, + "num_input_tokens_seen": 132587640, + "step": 6169, + "time_per_iteration": 2.639529228210449 + }, + { + "auxiliary_loss_clip": 0.01133024, + "auxiliary_loss_mlp": 0.01022993, + "balance_loss_clip": 1.04518294, + "balance_loss_mlp": 1.01583755, + "epoch": 0.7418986352431912, + "flos": 20084299920000.0, + "grad_norm": 2.1666729190003124, + "language_loss": 0.81358981, + "learning_rate": 6.588854083222857e-07, + "loss": 0.83514994, + "num_input_tokens_seen": 132607075, + "step": 6170, + "time_per_iteration": 2.5021615028381348 + }, + { + "auxiliary_loss_clip": 0.01144455, + "auxiliary_loss_mlp": 0.01026511, + "balance_loss_clip": 1.0483315, + "balance_loss_mlp": 1.01875067, + "epoch": 0.7420188781338304, + "flos": 18259571059200.0, + "grad_norm": 5.59677325555764, + "language_loss": 0.81034243, + "learning_rate": 6.583076239106444e-07, + "loss": 0.83205211, + "num_input_tokens_seen": 132625580, + "step": 6171, + "time_per_iteration": 2.461170196533203 + }, + { + "auxiliary_loss_clip": 0.01144108, + "auxiliary_loss_mlp": 0.0102294, + "balance_loss_clip": 1.04674983, + "balance_loss_mlp": 1.01540005, + "epoch": 0.7421391210244694, + "flos": 13771994319360.0, + "grad_norm": 2.001980496848609, + "language_loss": 0.75233805, + "learning_rate": 6.577300430326707e-07, + "loss": 0.77400851, + "num_input_tokens_seen": 132640525, + "step": 6172, + "time_per_iteration": 2.4620823860168457 + }, + { + "auxiliary_loss_clip": 0.01124835, + "auxiliary_loss_mlp": 0.01022405, + "balance_loss_clip": 1.04649031, + "balance_loss_mlp": 1.01547635, + "epoch": 0.7422593639151085, + "flos": 15961683317760.0, + "grad_norm": 2.3653042260689228, + "language_loss": 0.72111821, + "learning_rate": 6.571526657759821e-07, + "loss": 0.74259055, + "num_input_tokens_seen": 132656265, + "step": 6173, + "time_per_iteration": 2.4795925617218018 + }, + { + "auxiliary_loss_clip": 0.01146936, + "auxiliary_loss_mlp": 0.01020344, + "balance_loss_clip": 1.04446375, + "balance_loss_mlp": 1.01336098, + "epoch": 0.7423796068057477, + "flos": 30114400867200.0, + "grad_norm": 1.734758258463048, + "language_loss": 0.70693886, + "learning_rate": 6.565754922281663e-07, + "loss": 0.72861171, + "num_input_tokens_seen": 132678510, + "step": 6174, + "time_per_iteration": 2.5282599925994873 + }, + { + "auxiliary_loss_clip": 0.01137581, + "auxiliary_loss_mlp": 0.01027362, + "balance_loss_clip": 1.04447603, + "balance_loss_mlp": 1.02048039, + "epoch": 0.7424998496963867, + "flos": 20521907314560.0, + "grad_norm": 1.8061434100116054, + "language_loss": 0.78390324, + "learning_rate": 6.559985224767801e-07, + "loss": 0.8055526, + "num_input_tokens_seen": 132696385, + "step": 6175, + "time_per_iteration": 2.5167171955108643 + }, + { + "auxiliary_loss_clip": 0.01130282, + "auxiliary_loss_mlp": 0.01026147, + "balance_loss_clip": 1.04436326, + "balance_loss_mlp": 1.01907206, + "epoch": 0.7426200925870258, + "flos": 21871573873920.0, + "grad_norm": 3.420312848934449, + "language_loss": 0.75403482, + "learning_rate": 6.55421756609349e-07, + "loss": 0.77559912, + "num_input_tokens_seen": 132714640, + "step": 6176, + "time_per_iteration": 2.4969983100891113 + }, + { + "auxiliary_loss_clip": 0.01152111, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.04969072, + "balance_loss_mlp": 1.02165723, + "epoch": 0.7427403354776649, + "flos": 26432049265920.0, + "grad_norm": 1.7760128107619337, + "language_loss": 0.79036349, + "learning_rate": 6.54845194713369e-07, + "loss": 0.8121779, + "num_input_tokens_seen": 132735590, + "step": 6177, + "time_per_iteration": 4.072268486022949 + }, + { + "auxiliary_loss_clip": 0.01147979, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.04616809, + "balance_loss_mlp": 1.02205801, + "epoch": 0.742860578368304, + "flos": 19898390102400.0, + "grad_norm": 1.8916684867949831, + "language_loss": 0.79498696, + "learning_rate": 6.542688368763034e-07, + "loss": 0.81675303, + "num_input_tokens_seen": 132753995, + "step": 6178, + "time_per_iteration": 3.271073579788208 + }, + { + "auxiliary_loss_clip": 0.01152102, + "auxiliary_loss_mlp": 0.01027465, + "balance_loss_clip": 1.0482794, + "balance_loss_mlp": 1.02083421, + "epoch": 0.742980821258943, + "flos": 24827201510400.0, + "grad_norm": 2.4925975872946586, + "language_loss": 0.76932037, + "learning_rate": 6.536926831855854e-07, + "loss": 0.791116, + "num_input_tokens_seen": 132773160, + "step": 6179, + "time_per_iteration": 2.45601749420166 + }, + { + "auxiliary_loss_clip": 0.01135752, + "auxiliary_loss_mlp": 0.01022823, + "balance_loss_clip": 1.0452354, + "balance_loss_mlp": 1.01607895, + "epoch": 0.7431010641495821, + "flos": 25228646887680.0, + "grad_norm": 2.9300099208392836, + "language_loss": 0.72909594, + "learning_rate": 6.531167337286165e-07, + "loss": 0.7506817, + "num_input_tokens_seen": 132793180, + "step": 6180, + "time_per_iteration": 2.4815125465393066 + }, + { + "auxiliary_loss_clip": 0.01138423, + "auxiliary_loss_mlp": 0.0102296, + "balance_loss_clip": 1.0474757, + "balance_loss_mlp": 1.01621222, + "epoch": 0.7432213070402213, + "flos": 21762369550080.0, + "grad_norm": 1.403660437381547, + "language_loss": 0.7972523, + "learning_rate": 6.52540988592768e-07, + "loss": 0.81886613, + "num_input_tokens_seen": 132814200, + "step": 6181, + "time_per_iteration": 2.4895172119140625 + }, + { + "auxiliary_loss_clip": 0.01140588, + "auxiliary_loss_mlp": 0.01020485, + "balance_loss_clip": 1.04636407, + "balance_loss_mlp": 1.01364517, + "epoch": 0.7433415499308603, + "flos": 14793832425600.0, + "grad_norm": 1.9555727524976907, + "language_loss": 0.83165836, + "learning_rate": 6.519654478653814e-07, + "loss": 0.85326904, + "num_input_tokens_seen": 132832565, + "step": 6182, + "time_per_iteration": 3.2044496536254883 + }, + { + "auxiliary_loss_clip": 0.01052846, + "auxiliary_loss_mlp": 0.01002711, + "balance_loss_clip": 1.01496482, + "balance_loss_mlp": 1.00160837, + "epoch": 0.7434617928214994, + "flos": 67155577297920.0, + "grad_norm": 0.7421561489254233, + "language_loss": 0.5614903, + "learning_rate": 6.51390111633763e-07, + "loss": 0.58204579, + "num_input_tokens_seen": 132897840, + "step": 6183, + "time_per_iteration": 3.105877637863159 + }, + { + "auxiliary_loss_clip": 0.01094522, + "auxiliary_loss_mlp": 0.01020144, + "balance_loss_clip": 1.03875446, + "balance_loss_mlp": 1.01344442, + "epoch": 0.7435820357121385, + "flos": 27377576928000.0, + "grad_norm": 1.6357073275961822, + "language_loss": 0.76207614, + "learning_rate": 6.508149799851932e-07, + "loss": 0.78322279, + "num_input_tokens_seen": 132919505, + "step": 6184, + "time_per_iteration": 2.60357928276062 + }, + { + "auxiliary_loss_clip": 0.01136877, + "auxiliary_loss_mlp": 0.01021035, + "balance_loss_clip": 1.04546118, + "balance_loss_mlp": 1.01407325, + "epoch": 0.7437022786027776, + "flos": 23987645948160.0, + "grad_norm": 2.062518575948394, + "language_loss": 0.61426228, + "learning_rate": 6.502400530069183e-07, + "loss": 0.63584143, + "num_input_tokens_seen": 132939390, + "step": 6185, + "time_per_iteration": 2.455746650695801 + }, + { + "auxiliary_loss_clip": 0.01126645, + "auxiliary_loss_mlp": 0.01030541, + "balance_loss_clip": 1.04526186, + "balance_loss_mlp": 1.0229888, + "epoch": 0.7438225214934167, + "flos": 21866761451520.0, + "grad_norm": 1.6500706286797915, + "language_loss": 0.68501008, + "learning_rate": 6.496653307861535e-07, + "loss": 0.70658195, + "num_input_tokens_seen": 132960060, + "step": 6186, + "time_per_iteration": 2.522322416305542 + }, + { + "auxiliary_loss_clip": 0.01157895, + "auxiliary_loss_mlp": 0.01024997, + "balance_loss_clip": 1.04782844, + "balance_loss_mlp": 1.01790082, + "epoch": 0.7439427643840558, + "flos": 20230097224320.0, + "grad_norm": 3.6776352129083842, + "language_loss": 0.65812725, + "learning_rate": 6.490908134100857e-07, + "loss": 0.6799562, + "num_input_tokens_seen": 132978525, + "step": 6187, + "time_per_iteration": 2.436244487762451 + }, + { + "auxiliary_loss_clip": 0.01158552, + "auxiliary_loss_mlp": 0.01021989, + "balance_loss_clip": 1.04821706, + "balance_loss_mlp": 1.01454389, + "epoch": 0.7440630072746949, + "flos": 20849915335680.0, + "grad_norm": 2.2181268072344094, + "language_loss": 0.69327259, + "learning_rate": 6.48516500965866e-07, + "loss": 0.71507794, + "num_input_tokens_seen": 132998460, + "step": 6188, + "time_per_iteration": 2.444633722305298 + }, + { + "auxiliary_loss_clip": 0.01154872, + "auxiliary_loss_mlp": 0.01021909, + "balance_loss_clip": 1.04431081, + "balance_loss_mlp": 1.01470256, + "epoch": 0.7441832501653339, + "flos": 26503762769280.0, + "grad_norm": 1.8008470146056523, + "language_loss": 0.81646919, + "learning_rate": 6.479423935406192e-07, + "loss": 0.83823705, + "num_input_tokens_seen": 133018445, + "step": 6189, + "time_per_iteration": 2.476483106613159 + }, + { + "auxiliary_loss_clip": 0.01044096, + "auxiliary_loss_mlp": 0.01001698, + "balance_loss_clip": 1.01650774, + "balance_loss_mlp": 1.00073802, + "epoch": 0.7443034930559731, + "flos": 68602848088320.0, + "grad_norm": 0.8120585173931032, + "language_loss": 0.61998612, + "learning_rate": 6.473684912214357e-07, + "loss": 0.64044404, + "num_input_tokens_seen": 133082005, + "step": 6190, + "time_per_iteration": 3.1991283893585205 + }, + { + "auxiliary_loss_clip": 0.01153062, + "auxiliary_loss_mlp": 0.01021295, + "balance_loss_clip": 1.04863572, + "balance_loss_mlp": 1.01427984, + "epoch": 0.7444237359466122, + "flos": 18654982951680.0, + "grad_norm": 5.5890299380191895, + "language_loss": 0.69773829, + "learning_rate": 6.467947940953778e-07, + "loss": 0.71948195, + "num_input_tokens_seen": 133100530, + "step": 6191, + "time_per_iteration": 2.4406604766845703 + }, + { + "auxiliary_loss_clip": 0.01138434, + "auxiliary_loss_mlp": 0.0102792, + "balance_loss_clip": 1.04459417, + "balance_loss_mlp": 1.02111959, + "epoch": 0.7445439788372512, + "flos": 22817604326400.0, + "grad_norm": 1.7888926965732181, + "language_loss": 0.72490942, + "learning_rate": 6.462213022494732e-07, + "loss": 0.74657297, + "num_input_tokens_seen": 133119775, + "step": 6192, + "time_per_iteration": 2.484675645828247 + }, + { + "auxiliary_loss_clip": 0.0106198, + "auxiliary_loss_mlp": 0.01001569, + "balance_loss_clip": 1.0152657, + "balance_loss_mlp": 1.00056195, + "epoch": 0.7446642217278904, + "flos": 67045690615680.0, + "grad_norm": 0.7677672766641063, + "language_loss": 0.61051834, + "learning_rate": 6.456480157707201e-07, + "loss": 0.63115382, + "num_input_tokens_seen": 133184550, + "step": 6193, + "time_per_iteration": 2.9859704971313477 + }, + { + "auxiliary_loss_clip": 0.01118272, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.04254794, + "balance_loss_mlp": 1.0199039, + "epoch": 0.7447844646185294, + "flos": 17417465631360.0, + "grad_norm": 1.819256562295751, + "language_loss": 0.84713626, + "learning_rate": 6.450749347460866e-07, + "loss": 0.86859524, + "num_input_tokens_seen": 133201525, + "step": 6194, + "time_per_iteration": 2.4781603813171387 + }, + { + "auxiliary_loss_clip": 0.01167648, + "auxiliary_loss_mlp": 0.0102575, + "balance_loss_clip": 1.04753089, + "balance_loss_mlp": 1.01884818, + "epoch": 0.7449047075091685, + "flos": 26615876094720.0, + "grad_norm": 1.8097698630998473, + "language_loss": 0.78789788, + "learning_rate": 6.445020592625083e-07, + "loss": 0.80983186, + "num_input_tokens_seen": 133222175, + "step": 6195, + "time_per_iteration": 2.4590728282928467 + }, + { + "auxiliary_loss_clip": 0.01166421, + "auxiliary_loss_mlp": 0.01026383, + "balance_loss_clip": 1.04703259, + "balance_loss_mlp": 1.01885176, + "epoch": 0.7450249503998077, + "flos": 14170458867840.0, + "grad_norm": 2.310880236555306, + "language_loss": 0.80482769, + "learning_rate": 6.4392938940689e-07, + "loss": 0.8267557, + "num_input_tokens_seen": 133237590, + "step": 6196, + "time_per_iteration": 2.3711137771606445 + }, + { + "auxiliary_loss_clip": 0.01109084, + "auxiliary_loss_mlp": 0.00761268, + "balance_loss_clip": 1.04393423, + "balance_loss_mlp": 1.0003289, + "epoch": 0.7451451932904467, + "flos": 19606687752960.0, + "grad_norm": 2.3412638046916, + "language_loss": 0.72118419, + "learning_rate": 6.433569252661049e-07, + "loss": 0.73988771, + "num_input_tokens_seen": 133255590, + "step": 6197, + "time_per_iteration": 2.5289671421051025 + }, + { + "auxiliary_loss_clip": 0.01118567, + "auxiliary_loss_mlp": 0.01023067, + "balance_loss_clip": 1.04270315, + "balance_loss_mlp": 1.01637971, + "epoch": 0.7452654361810858, + "flos": 12495405980160.0, + "grad_norm": 1.7419545742608133, + "language_loss": 0.71128464, + "learning_rate": 6.427846669269952e-07, + "loss": 0.73270094, + "num_input_tokens_seen": 133273210, + "step": 6198, + "time_per_iteration": 2.473322629928589 + }, + { + "auxiliary_loss_clip": 0.01170042, + "auxiliary_loss_mlp": 0.01027548, + "balance_loss_clip": 1.05222011, + "balance_loss_mlp": 1.02088404, + "epoch": 0.7453856790717249, + "flos": 22127329687680.0, + "grad_norm": 2.2693060423334908, + "language_loss": 0.82625848, + "learning_rate": 6.422126144763729e-07, + "loss": 0.84823436, + "num_input_tokens_seen": 133292600, + "step": 6199, + "time_per_iteration": 2.4111764430999756 + }, + { + "auxiliary_loss_clip": 0.01123233, + "auxiliary_loss_mlp": 0.00761984, + "balance_loss_clip": 1.0410471, + "balance_loss_mlp": 1.00040412, + "epoch": 0.745505921962364, + "flos": 20010682995840.0, + "grad_norm": 3.7885517949085235, + "language_loss": 0.76948673, + "learning_rate": 6.416407680010174e-07, + "loss": 0.7883389, + "num_input_tokens_seen": 133306960, + "step": 6200, + "time_per_iteration": 2.4838271141052246 + }, + { + "auxiliary_loss_clip": 0.01126198, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.04662418, + "balance_loss_mlp": 1.01968622, + "epoch": 0.745626164853003, + "flos": 24677884673280.0, + "grad_norm": 1.9412734105472431, + "language_loss": 0.81055725, + "learning_rate": 6.410691275876774e-07, + "loss": 0.83209497, + "num_input_tokens_seen": 133326380, + "step": 6201, + "time_per_iteration": 2.561248540878296 + }, + { + "auxiliary_loss_clip": 0.01146038, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.04729605, + "balance_loss_mlp": 1.0167954, + "epoch": 0.7457464077436422, + "flos": 14538830797440.0, + "grad_norm": 3.983802267878781, + "language_loss": 0.76758128, + "learning_rate": 6.404976933230704e-07, + "loss": 0.78927982, + "num_input_tokens_seen": 133342900, + "step": 6202, + "time_per_iteration": 2.43813157081604 + }, + { + "auxiliary_loss_clip": 0.01144489, + "auxiliary_loss_mlp": 0.01025861, + "balance_loss_clip": 1.04676485, + "balance_loss_mlp": 1.01875329, + "epoch": 0.7458666506342813, + "flos": 34021194600960.0, + "grad_norm": 2.05260282849613, + "language_loss": 0.72575796, + "learning_rate": 6.399264652938813e-07, + "loss": 0.7474615, + "num_input_tokens_seen": 133363805, + "step": 6203, + "time_per_iteration": 2.574030876159668 + }, + { + "auxiliary_loss_clip": 0.01136499, + "auxiliary_loss_mlp": 0.01022459, + "balance_loss_clip": 1.04431498, + "balance_loss_mlp": 1.01539612, + "epoch": 0.7459868935249203, + "flos": 24279025075200.0, + "grad_norm": 1.9374598061480668, + "language_loss": 0.74500263, + "learning_rate": 6.393554435867679e-07, + "loss": 0.76659214, + "num_input_tokens_seen": 133384655, + "step": 6204, + "time_per_iteration": 4.084227561950684 + }, + { + "auxiliary_loss_clip": 0.01121001, + "auxiliary_loss_mlp": 0.01023674, + "balance_loss_clip": 1.04250634, + "balance_loss_mlp": 1.01597285, + "epoch": 0.7461071364155595, + "flos": 21908777385600.0, + "grad_norm": 1.9800080127206716, + "language_loss": 0.83316207, + "learning_rate": 6.387846282883502e-07, + "loss": 0.85460889, + "num_input_tokens_seen": 133401185, + "step": 6205, + "time_per_iteration": 3.3325726985931396 + }, + { + "auxiliary_loss_clip": 0.01164902, + "auxiliary_loss_mlp": 0.01021893, + "balance_loss_clip": 1.04754555, + "balance_loss_mlp": 1.0149188, + "epoch": 0.7462273793061985, + "flos": 22889712879360.0, + "grad_norm": 2.9273996422156014, + "language_loss": 0.76747751, + "learning_rate": 6.38214019485223e-07, + "loss": 0.78934544, + "num_input_tokens_seen": 133420010, + "step": 6206, + "time_per_iteration": 2.4077541828155518 + }, + { + "auxiliary_loss_clip": 0.01093658, + "auxiliary_loss_mlp": 0.01024832, + "balance_loss_clip": 1.04005182, + "balance_loss_mlp": 1.01764965, + "epoch": 0.7463476221968376, + "flos": 19968451580160.0, + "grad_norm": 1.859492413114982, + "language_loss": 0.71791303, + "learning_rate": 6.376436172639461e-07, + "loss": 0.73909795, + "num_input_tokens_seen": 133437855, + "step": 6207, + "time_per_iteration": 2.5536484718322754 + }, + { + "auxiliary_loss_clip": 0.01088513, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.04204917, + "balance_loss_mlp": 1.0219785, + "epoch": 0.7464678650874768, + "flos": 16836610798080.0, + "grad_norm": 2.3585745994790352, + "language_loss": 0.64730155, + "learning_rate": 6.370734217110487e-07, + "loss": 0.66848779, + "num_input_tokens_seen": 133456600, + "step": 6208, + "time_per_iteration": 3.3388936519622803 + }, + { + "auxiliary_loss_clip": 0.01142146, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.04774106, + "balance_loss_mlp": 1.01996636, + "epoch": 0.7465881079781158, + "flos": 48100869843840.0, + "grad_norm": 1.4505603185848304, + "language_loss": 0.64256763, + "learning_rate": 6.36503432913031e-07, + "loss": 0.66426361, + "num_input_tokens_seen": 133479745, + "step": 6209, + "time_per_iteration": 2.729149103164673 + }, + { + "auxiliary_loss_clip": 0.01150543, + "auxiliary_loss_mlp": 0.01024486, + "balance_loss_clip": 1.04698348, + "balance_loss_mlp": 1.01703, + "epoch": 0.7467083508687549, + "flos": 19677359761920.0, + "grad_norm": 2.0386818034073877, + "language_loss": 0.68967742, + "learning_rate": 6.359336509563569e-07, + "loss": 0.71142769, + "num_input_tokens_seen": 133495765, + "step": 6210, + "time_per_iteration": 2.407444953918457 + }, + { + "auxiliary_loss_clip": 0.01111907, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.04248261, + "balance_loss_mlp": 1.02367055, + "epoch": 0.7468285937593939, + "flos": 17895436934400.0, + "grad_norm": 1.7850169020944548, + "language_loss": 0.80585963, + "learning_rate": 6.353640759274641e-07, + "loss": 0.82728696, + "num_input_tokens_seen": 133514655, + "step": 6211, + "time_per_iteration": 2.466000556945801 + }, + { + "auxiliary_loss_clip": 0.01148716, + "auxiliary_loss_mlp": 0.01021498, + "balance_loss_clip": 1.0438211, + "balance_loss_mlp": 1.01427948, + "epoch": 0.7469488366500331, + "flos": 23141446369920.0, + "grad_norm": 3.815208700603355, + "language_loss": 0.73478317, + "learning_rate": 6.347947079127556e-07, + "loss": 0.75648534, + "num_input_tokens_seen": 133532555, + "step": 6212, + "time_per_iteration": 2.448587656021118 + }, + { + "auxiliary_loss_clip": 0.01135783, + "auxiliary_loss_mlp": 0.0102314, + "balance_loss_clip": 1.04566979, + "balance_loss_mlp": 1.01576042, + "epoch": 0.7470690795406721, + "flos": 16690849407360.0, + "grad_norm": 2.2187525466721603, + "language_loss": 0.77419782, + "learning_rate": 6.342255469986053e-07, + "loss": 0.79578704, + "num_input_tokens_seen": 133551300, + "step": 6213, + "time_per_iteration": 2.444871187210083 + }, + { + "auxiliary_loss_clip": 0.0116441, + "auxiliary_loss_mlp": 0.01025067, + "balance_loss_clip": 1.04717588, + "balance_loss_mlp": 1.01775301, + "epoch": 0.7471893224313112, + "flos": 25192700352000.0, + "grad_norm": 1.8812834755729595, + "language_loss": 0.76202929, + "learning_rate": 6.336565932713533e-07, + "loss": 0.7839241, + "num_input_tokens_seen": 133570725, + "step": 6214, + "time_per_iteration": 2.43023419380188 + }, + { + "auxiliary_loss_clip": 0.0113915, + "auxiliary_loss_mlp": 0.01025334, + "balance_loss_clip": 1.0497694, + "balance_loss_mlp": 1.01788592, + "epoch": 0.7473095653219504, + "flos": 22526225199360.0, + "grad_norm": 1.9257933638632283, + "language_loss": 0.77874303, + "learning_rate": 6.330878468173088e-07, + "loss": 0.80038786, + "num_input_tokens_seen": 133590790, + "step": 6215, + "time_per_iteration": 2.4914753437042236 + }, + { + "auxiliary_loss_clip": 0.01144302, + "auxiliary_loss_mlp": 0.01021666, + "balance_loss_clip": 1.044276, + "balance_loss_mlp": 1.01465321, + "epoch": 0.7474298082125894, + "flos": 18113989236480.0, + "grad_norm": 1.833161394688964, + "language_loss": 0.73129159, + "learning_rate": 6.32519307722752e-07, + "loss": 0.75295126, + "num_input_tokens_seen": 133608685, + "step": 6216, + "time_per_iteration": 2.413458824157715 + }, + { + "auxiliary_loss_clip": 0.01040271, + "auxiliary_loss_mlp": 0.01002351, + "balance_loss_clip": 1.02107894, + "balance_loss_mlp": 1.00124824, + "epoch": 0.7475500511032285, + "flos": 62086535193600.0, + "grad_norm": 0.9147310792187742, + "language_loss": 0.54990089, + "learning_rate": 6.31950976073929e-07, + "loss": 0.5703271, + "num_input_tokens_seen": 133662775, + "step": 6217, + "time_per_iteration": 3.065585136413574 + }, + { + "auxiliary_loss_clip": 0.0111098, + "auxiliary_loss_mlp": 0.01023694, + "balance_loss_clip": 1.04553318, + "balance_loss_mlp": 1.01650882, + "epoch": 0.7476702939938676, + "flos": 17785586165760.0, + "grad_norm": 2.8981021199372297, + "language_loss": 0.8071233, + "learning_rate": 6.31382851957055e-07, + "loss": 0.82847011, + "num_input_tokens_seen": 133679595, + "step": 6218, + "time_per_iteration": 2.5134427547454834 + }, + { + "auxiliary_loss_clip": 0.01121556, + "auxiliary_loss_mlp": 0.00761805, + "balance_loss_clip": 1.04384255, + "balance_loss_mlp": 1.00042236, + "epoch": 0.7477905368845067, + "flos": 27927944092800.0, + "grad_norm": 1.9047534806383175, + "language_loss": 0.7137059, + "learning_rate": 6.308149354583143e-07, + "loss": 0.73253953, + "num_input_tokens_seen": 133699000, + "step": 6219, + "time_per_iteration": 2.542675256729126 + }, + { + "auxiliary_loss_clip": 0.01158034, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.04858708, + "balance_loss_mlp": 1.02016139, + "epoch": 0.7479107797751458, + "flos": 26870374932480.0, + "grad_norm": 2.067737699056454, + "language_loss": 0.82009405, + "learning_rate": 6.302472266638586e-07, + "loss": 0.84195369, + "num_input_tokens_seen": 133719540, + "step": 6220, + "time_per_iteration": 2.4710311889648438 + }, + { + "auxiliary_loss_clip": 0.01174272, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.05049396, + "balance_loss_mlp": 1.01959229, + "epoch": 0.7480310226657849, + "flos": 33943375785600.0, + "grad_norm": 2.3184549032649464, + "language_loss": 0.70081878, + "learning_rate": 6.296797256598101e-07, + "loss": 0.72283262, + "num_input_tokens_seen": 133741020, + "step": 6221, + "time_per_iteration": 2.5132362842559814 + }, + { + "auxiliary_loss_clip": 0.01115356, + "auxiliary_loss_mlp": 0.01020992, + "balance_loss_clip": 1.04266918, + "balance_loss_mlp": 1.01424766, + "epoch": 0.748151265556424, + "flos": 24826555065600.0, + "grad_norm": 1.6507157849663419, + "language_loss": 0.81287891, + "learning_rate": 6.291124325322576e-07, + "loss": 0.83424234, + "num_input_tokens_seen": 133761145, + "step": 6222, + "time_per_iteration": 2.529958486557007 + }, + { + "auxiliary_loss_clip": 0.01144576, + "auxiliary_loss_mlp": 0.01021906, + "balance_loss_clip": 1.04648924, + "balance_loss_mlp": 1.01461053, + "epoch": 0.748271508447063, + "flos": 38399351535360.0, + "grad_norm": 1.5315019259196678, + "language_loss": 0.62317777, + "learning_rate": 6.285453473672595e-07, + "loss": 0.64484257, + "num_input_tokens_seen": 133783715, + "step": 6223, + "time_per_iteration": 2.622898817062378 + }, + { + "auxiliary_loss_clip": 0.01162977, + "auxiliary_loss_mlp": 0.01022608, + "balance_loss_clip": 1.04584289, + "balance_loss_mlp": 1.01568222, + "epoch": 0.7483917513377022, + "flos": 21541842000000.0, + "grad_norm": 1.8350760105936486, + "language_loss": 0.75425625, + "learning_rate": 6.279784702508415e-07, + "loss": 0.77611214, + "num_input_tokens_seen": 133804465, + "step": 6224, + "time_per_iteration": 2.430347442626953 + }, + { + "auxiliary_loss_clip": 0.01042139, + "auxiliary_loss_mlp": 0.01001089, + "balance_loss_clip": 1.01416397, + "balance_loss_mlp": 1.00000405, + "epoch": 0.7485119942283412, + "flos": 62314532772480.0, + "grad_norm": 0.7883586487722969, + "language_loss": 0.5861026, + "learning_rate": 6.274118012689979e-07, + "loss": 0.6065349, + "num_input_tokens_seen": 133866365, + "step": 6225, + "time_per_iteration": 3.1924705505371094 + }, + { + "auxiliary_loss_clip": 0.01132486, + "auxiliary_loss_mlp": 0.01020595, + "balance_loss_clip": 1.04461217, + "balance_loss_mlp": 1.01350808, + "epoch": 0.7486322371189803, + "flos": 29937613104000.0, + "grad_norm": 1.4412620309893303, + "language_loss": 0.67983925, + "learning_rate": 6.268453405076943e-07, + "loss": 0.70137006, + "num_input_tokens_seen": 133888760, + "step": 6226, + "time_per_iteration": 2.5535829067230225 + }, + { + "auxiliary_loss_clip": 0.01137645, + "auxiliary_loss_mlp": 0.01020275, + "balance_loss_clip": 1.04396915, + "balance_loss_mlp": 1.01374507, + "epoch": 0.7487524800096195, + "flos": 18949414734720.0, + "grad_norm": 1.8144143569730453, + "language_loss": 0.82340175, + "learning_rate": 6.262790880528592e-07, + "loss": 0.84498096, + "num_input_tokens_seen": 133906380, + "step": 6227, + "time_per_iteration": 2.4477949142456055 + }, + { + "auxiliary_loss_clip": 0.01134435, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.04232669, + "balance_loss_mlp": 1.02282381, + "epoch": 0.7488727229002585, + "flos": 18697393935360.0, + "grad_norm": 2.344175243345799, + "language_loss": 0.79441434, + "learning_rate": 6.257130439903951e-07, + "loss": 0.81606305, + "num_input_tokens_seen": 133922875, + "step": 6228, + "time_per_iteration": 2.4687416553497314 + }, + { + "auxiliary_loss_clip": 0.01170136, + "auxiliary_loss_mlp": 0.01024206, + "balance_loss_clip": 1.05075121, + "balance_loss_mlp": 1.0170629, + "epoch": 0.7489929657908976, + "flos": 23623368168960.0, + "grad_norm": 1.8729090487936177, + "language_loss": 0.81070983, + "learning_rate": 6.251472084061695e-07, + "loss": 0.83265328, + "num_input_tokens_seen": 133941795, + "step": 6229, + "time_per_iteration": 2.4162254333496094 + }, + { + "auxiliary_loss_clip": 0.01153283, + "auxiliary_loss_mlp": 0.01023761, + "balance_loss_clip": 1.04921317, + "balance_loss_mlp": 1.01695156, + "epoch": 0.7491132086815367, + "flos": 20551533056640.0, + "grad_norm": 1.876590929168162, + "language_loss": 0.88928324, + "learning_rate": 6.245815813860191e-07, + "loss": 0.91105366, + "num_input_tokens_seen": 133957305, + "step": 6230, + "time_per_iteration": 3.991645336151123 + }, + { + "auxiliary_loss_clip": 0.01169023, + "auxiliary_loss_mlp": 0.01023028, + "balance_loss_clip": 1.04729509, + "balance_loss_mlp": 1.01581001, + "epoch": 0.7492334515721758, + "flos": 23003011353600.0, + "grad_norm": 1.9645323292033885, + "language_loss": 0.70029616, + "learning_rate": 6.240161630157495e-07, + "loss": 0.72221667, + "num_input_tokens_seen": 133976660, + "step": 6231, + "time_per_iteration": 2.4113821983337402 + }, + { + "auxiliary_loss_clip": 0.01169446, + "auxiliary_loss_mlp": 0.01023414, + "balance_loss_clip": 1.04816628, + "balance_loss_mlp": 1.01628804, + "epoch": 0.7493536944628149, + "flos": 16398823835520.0, + "grad_norm": 1.84444668515726, + "language_loss": 0.70071959, + "learning_rate": 6.23450953381133e-07, + "loss": 0.72264814, + "num_input_tokens_seen": 133994750, + "step": 6232, + "time_per_iteration": 3.219994306564331 + }, + { + "auxiliary_loss_clip": 0.0113269, + "auxiliary_loss_mlp": 0.01023618, + "balance_loss_clip": 1.04464459, + "balance_loss_mlp": 1.01659024, + "epoch": 0.749473937353454, + "flos": 15338561155200.0, + "grad_norm": 1.9465253736845634, + "language_loss": 0.68101096, + "learning_rate": 6.228859525679131e-07, + "loss": 0.70257407, + "num_input_tokens_seen": 134009165, + "step": 6233, + "time_per_iteration": 2.4291491508483887 + }, + { + "auxiliary_loss_clip": 0.01151357, + "auxiliary_loss_mlp": 0.01023636, + "balance_loss_clip": 1.04587197, + "balance_loss_mlp": 1.01654911, + "epoch": 0.7495941802440931, + "flos": 18951138587520.0, + "grad_norm": 2.370861030985141, + "language_loss": 0.79979414, + "learning_rate": 6.223211606617986e-07, + "loss": 0.82154411, + "num_input_tokens_seen": 134027585, + "step": 6234, + "time_per_iteration": 2.4236137866973877 + }, + { + "auxiliary_loss_clip": 0.01151537, + "auxiliary_loss_mlp": 0.01023931, + "balance_loss_clip": 1.05012703, + "balance_loss_mlp": 1.01771128, + "epoch": 0.7497144231347321, + "flos": 22492469393280.0, + "grad_norm": 1.8198836772299694, + "language_loss": 0.83996075, + "learning_rate": 6.217565777484701e-07, + "loss": 0.86171544, + "num_input_tokens_seen": 134046680, + "step": 6235, + "time_per_iteration": 3.1726624965667725 + }, + { + "auxiliary_loss_clip": 0.01134361, + "auxiliary_loss_mlp": 0.00761634, + "balance_loss_clip": 1.04438329, + "balance_loss_mlp": 1.00038457, + "epoch": 0.7498346660253713, + "flos": 24243509502720.0, + "grad_norm": 1.7571962626905717, + "language_loss": 0.80194199, + "learning_rate": 6.211922039135722e-07, + "loss": 0.82090193, + "num_input_tokens_seen": 134066825, + "step": 6236, + "time_per_iteration": 2.5021450519561768 + }, + { + "auxiliary_loss_clip": 0.01168505, + "auxiliary_loss_mlp": 0.01025935, + "balance_loss_clip": 1.049384, + "balance_loss_mlp": 1.01885366, + "epoch": 0.7499549089160104, + "flos": 24387080163840.0, + "grad_norm": 2.9861287230376203, + "language_loss": 0.80835712, + "learning_rate": 6.206280392427201e-07, + "loss": 0.83030152, + "num_input_tokens_seen": 134086410, + "step": 6237, + "time_per_iteration": 2.4432663917541504 + }, + { + "auxiliary_loss_clip": 0.01146797, + "auxiliary_loss_mlp": 0.01021113, + "balance_loss_clip": 1.04471016, + "balance_loss_mlp": 1.01388574, + "epoch": 0.7500751518066494, + "flos": 34057320704640.0, + "grad_norm": 1.568529236453423, + "language_loss": 0.73682833, + "learning_rate": 6.200640838214983e-07, + "loss": 0.75850737, + "num_input_tokens_seen": 134109185, + "step": 6238, + "time_per_iteration": 2.5562078952789307 + }, + { + "auxiliary_loss_clip": 0.01166918, + "auxiliary_loss_mlp": 0.01025921, + "balance_loss_clip": 1.04849482, + "balance_loss_mlp": 1.01878011, + "epoch": 0.7501953946972886, + "flos": 18843586289280.0, + "grad_norm": 1.8420751057366733, + "language_loss": 0.66660023, + "learning_rate": 6.195003377354578e-07, + "loss": 0.68852854, + "num_input_tokens_seen": 134128455, + "step": 6239, + "time_per_iteration": 2.402932643890381 + }, + { + "auxiliary_loss_clip": 0.01150001, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.04455245, + "balance_loss_mlp": 1.02169895, + "epoch": 0.7503156375879276, + "flos": 20257675891200.0, + "grad_norm": 2.626204985629956, + "language_loss": 0.72696888, + "learning_rate": 6.189368010701183e-07, + "loss": 0.74876332, + "num_input_tokens_seen": 134145515, + "step": 6240, + "time_per_iteration": 2.4027717113494873 + }, + { + "auxiliary_loss_clip": 0.01157392, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.04523158, + "balance_loss_mlp": 1.01860261, + "epoch": 0.7504358804785667, + "flos": 13480040574720.0, + "grad_norm": 1.9453612773620705, + "language_loss": 0.76529676, + "learning_rate": 6.183734739109683e-07, + "loss": 0.78713024, + "num_input_tokens_seen": 134163335, + "step": 6241, + "time_per_iteration": 2.4118711948394775 + }, + { + "auxiliary_loss_clip": 0.01160642, + "auxiliary_loss_mlp": 0.01025354, + "balance_loss_clip": 1.04854369, + "balance_loss_mlp": 1.01773643, + "epoch": 0.7505561233692057, + "flos": 29461042431360.0, + "grad_norm": 2.2504640541617373, + "language_loss": 0.68508184, + "learning_rate": 6.178103563434629e-07, + "loss": 0.70694178, + "num_input_tokens_seen": 134182335, + "step": 6242, + "time_per_iteration": 2.481496572494507 + }, + { + "auxiliary_loss_clip": 0.01166838, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.04749024, + "balance_loss_mlp": 1.02251315, + "epoch": 0.7506763662598449, + "flos": 20302457172480.0, + "grad_norm": 1.7734571656123108, + "language_loss": 0.83663851, + "learning_rate": 6.172474484530283e-07, + "loss": 0.85860145, + "num_input_tokens_seen": 134201070, + "step": 6243, + "time_per_iteration": 2.3832345008850098 + }, + { + "auxiliary_loss_clip": 0.01129106, + "auxiliary_loss_mlp": 0.01025663, + "balance_loss_clip": 1.0410111, + "balance_loss_mlp": 1.01842117, + "epoch": 0.750796609150484, + "flos": 37230961939200.0, + "grad_norm": 2.171412603974266, + "language_loss": 0.75914609, + "learning_rate": 6.166847503250563e-07, + "loss": 0.78069377, + "num_input_tokens_seen": 134223310, + "step": 6244, + "time_per_iteration": 2.6111512184143066 + }, + { + "auxiliary_loss_clip": 0.01141789, + "auxiliary_loss_mlp": 0.01023911, + "balance_loss_clip": 1.04663026, + "balance_loss_mlp": 1.01670218, + "epoch": 0.750916852041123, + "flos": 19609417186560.0, + "grad_norm": 2.3001814639975025, + "language_loss": 0.79256475, + "learning_rate": 6.161222620449078e-07, + "loss": 0.81422174, + "num_input_tokens_seen": 134242085, + "step": 6245, + "time_per_iteration": 2.5093770027160645 + }, + { + "auxiliary_loss_clip": 0.01126965, + "auxiliary_loss_mlp": 0.01028541, + "balance_loss_clip": 1.04454315, + "balance_loss_mlp": 1.0212009, + "epoch": 0.7510370949317622, + "flos": 25112690807040.0, + "grad_norm": 6.03203468351411, + "language_loss": 0.79935658, + "learning_rate": 6.155599836979117e-07, + "loss": 0.82091165, + "num_input_tokens_seen": 134260770, + "step": 6246, + "time_per_iteration": 2.533597230911255 + }, + { + "auxiliary_loss_clip": 0.01110671, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.04131794, + "balance_loss_mlp": 1.02083826, + "epoch": 0.7511573378224012, + "flos": 19062282245760.0, + "grad_norm": 2.476737835506978, + "language_loss": 0.81687248, + "learning_rate": 6.149979153693649e-07, + "loss": 0.83826292, + "num_input_tokens_seen": 134278025, + "step": 6247, + "time_per_iteration": 2.517467498779297 + }, + { + "auxiliary_loss_clip": 0.01150882, + "auxiliary_loss_mlp": 0.01021136, + "balance_loss_clip": 1.04562283, + "balance_loss_mlp": 1.01387036, + "epoch": 0.7512775807130403, + "flos": 19937676602880.0, + "grad_norm": 1.9242567808821216, + "language_loss": 0.76984453, + "learning_rate": 6.144360571445343e-07, + "loss": 0.7915647, + "num_input_tokens_seen": 134297170, + "step": 6248, + "time_per_iteration": 2.4166879653930664 + }, + { + "auxiliary_loss_clip": 0.01153255, + "auxiliary_loss_mlp": 0.01024226, + "balance_loss_clip": 1.04967964, + "balance_loss_mlp": 1.01670694, + "epoch": 0.7513978236036795, + "flos": 20739920912640.0, + "grad_norm": 1.677912911373662, + "language_loss": 0.80133832, + "learning_rate": 6.138744091086509e-07, + "loss": 0.82311308, + "num_input_tokens_seen": 134316755, + "step": 6249, + "time_per_iteration": 2.4671390056610107 + }, + { + "auxiliary_loss_clip": 0.01129486, + "auxiliary_loss_mlp": 0.01023699, + "balance_loss_clip": 1.04612064, + "balance_loss_mlp": 1.01657593, + "epoch": 0.7515180664943185, + "flos": 27563163523200.0, + "grad_norm": 2.9477022416916894, + "language_loss": 0.72612911, + "learning_rate": 6.133129713469183e-07, + "loss": 0.74766099, + "num_input_tokens_seen": 134335960, + "step": 6250, + "time_per_iteration": 2.5370397567749023 + }, + { + "auxiliary_loss_clip": 0.0113391, + "auxiliary_loss_mlp": 0.01024471, + "balance_loss_clip": 1.04283834, + "balance_loss_mlp": 1.0167762, + "epoch": 0.7516383093849576, + "flos": 33803181002880.0, + "grad_norm": 1.642104574358395, + "language_loss": 0.63948023, + "learning_rate": 6.127517439445053e-07, + "loss": 0.66106409, + "num_input_tokens_seen": 134356805, + "step": 6251, + "time_per_iteration": 2.640925645828247 + }, + { + "auxiliary_loss_clip": 0.0110339, + "auxiliary_loss_mlp": 0.01025942, + "balance_loss_clip": 1.04175401, + "balance_loss_mlp": 1.01900971, + "epoch": 0.7517585522755967, + "flos": 29746172592000.0, + "grad_norm": 1.7587436804556467, + "language_loss": 0.81388497, + "learning_rate": 6.121907269865498e-07, + "loss": 0.83517826, + "num_input_tokens_seen": 134376295, + "step": 6252, + "time_per_iteration": 2.587433099746704 + }, + { + "auxiliary_loss_clip": 0.01033395, + "auxiliary_loss_mlp": 0.01001507, + "balance_loss_clip": 1.01297998, + "balance_loss_mlp": 1.00034511, + "epoch": 0.7518787951662358, + "flos": 69807974319360.0, + "grad_norm": 0.9252908462086723, + "language_loss": 0.67270464, + "learning_rate": 6.116299205581577e-07, + "loss": 0.69305366, + "num_input_tokens_seen": 134431125, + "step": 6253, + "time_per_iteration": 3.0550522804260254 + }, + { + "auxiliary_loss_clip": 0.01172567, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.05047369, + "balance_loss_mlp": 1.02153063, + "epoch": 0.7519990380568748, + "flos": 34203225749760.0, + "grad_norm": 1.9753732488135523, + "language_loss": 0.684681, + "learning_rate": 6.110693247444018e-07, + "loss": 0.70669746, + "num_input_tokens_seen": 134452960, + "step": 6254, + "time_per_iteration": 2.520676374435425 + }, + { + "auxiliary_loss_clip": 0.0111267, + "auxiliary_loss_mlp": 0.01021024, + "balance_loss_clip": 1.04187322, + "balance_loss_mlp": 1.01428902, + "epoch": 0.752119280947514, + "flos": 21725704742400.0, + "grad_norm": 2.126531339211925, + "language_loss": 0.82428157, + "learning_rate": 6.105089396303258e-07, + "loss": 0.84561855, + "num_input_tokens_seen": 134471350, + "step": 6255, + "time_per_iteration": 2.487708568572998 + }, + { + "auxiliary_loss_clip": 0.01138127, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.04393554, + "balance_loss_mlp": 1.02078891, + "epoch": 0.7522395238381531, + "flos": 32742774668160.0, + "grad_norm": 2.0933952787177934, + "language_loss": 0.75474834, + "learning_rate": 6.099487653009383e-07, + "loss": 0.77641404, + "num_input_tokens_seen": 134490695, + "step": 6256, + "time_per_iteration": 2.5547077655792236 + }, + { + "auxiliary_loss_clip": 0.01150694, + "auxiliary_loss_mlp": 0.01021254, + "balance_loss_clip": 1.04513693, + "balance_loss_mlp": 1.01500177, + "epoch": 0.7523597667287921, + "flos": 23476026579840.0, + "grad_norm": 1.981453671770173, + "language_loss": 0.82870805, + "learning_rate": 6.093888018412192e-07, + "loss": 0.85042763, + "num_input_tokens_seen": 134506885, + "step": 6257, + "time_per_iteration": 4.064921140670776 + }, + { + "auxiliary_loss_clip": 0.01060228, + "auxiliary_loss_mlp": 0.01001605, + "balance_loss_clip": 1.01348948, + "balance_loss_mlp": 1.00044858, + "epoch": 0.7524800096194313, + "flos": 67346730501120.0, + "grad_norm": 0.706416242686214, + "language_loss": 0.54675937, + "learning_rate": 6.088290493361125e-07, + "loss": 0.56737769, + "num_input_tokens_seen": 134571770, + "step": 6258, + "time_per_iteration": 3.1746506690979004 + }, + { + "auxiliary_loss_clip": 0.01103374, + "auxiliary_loss_mlp": 0.01025143, + "balance_loss_clip": 1.04152441, + "balance_loss_mlp": 1.01802027, + "epoch": 0.7526002525100703, + "flos": 13006055681280.0, + "grad_norm": 2.1684632685671823, + "language_loss": 0.71786094, + "learning_rate": 6.082695078705322e-07, + "loss": 0.73914611, + "num_input_tokens_seen": 134589250, + "step": 6259, + "time_per_iteration": 3.35184907913208 + }, + { + "auxiliary_loss_clip": 0.01146666, + "auxiliary_loss_mlp": 0.01025885, + "balance_loss_clip": 1.04649723, + "balance_loss_mlp": 1.01827335, + "epoch": 0.7527204954007094, + "flos": 21397229844480.0, + "grad_norm": 2.0456011277912367, + "language_loss": 0.69125962, + "learning_rate": 6.077101775293618e-07, + "loss": 0.71298516, + "num_input_tokens_seen": 134608075, + "step": 6260, + "time_per_iteration": 2.4235687255859375 + }, + { + "auxiliary_loss_clip": 0.01155083, + "auxiliary_loss_mlp": 0.0102883, + "balance_loss_clip": 1.04685998, + "balance_loss_mlp": 1.02077448, + "epoch": 0.7528407382913486, + "flos": 18947188091520.0, + "grad_norm": 2.4849077527889643, + "language_loss": 0.82339543, + "learning_rate": 6.071510583974504e-07, + "loss": 0.84523457, + "num_input_tokens_seen": 134623260, + "step": 6261, + "time_per_iteration": 2.3993842601776123 + }, + { + "auxiliary_loss_clip": 0.01170101, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.04961562, + "balance_loss_mlp": 1.02209187, + "epoch": 0.7529609811819876, + "flos": 15231798956160.0, + "grad_norm": 1.8486718868682808, + "language_loss": 0.71922195, + "learning_rate": 6.065921505596161e-07, + "loss": 0.74121767, + "num_input_tokens_seen": 134641540, + "step": 6262, + "time_per_iteration": 3.09307599067688 + }, + { + "auxiliary_loss_clip": 0.01125493, + "auxiliary_loss_mlp": 0.01023624, + "balance_loss_clip": 1.04510975, + "balance_loss_mlp": 1.01632512, + "epoch": 0.7530812240726267, + "flos": 19354487385600.0, + "grad_norm": 1.6740837587178967, + "language_loss": 0.77016187, + "learning_rate": 6.060334541006445e-07, + "loss": 0.79165304, + "num_input_tokens_seen": 134660035, + "step": 6263, + "time_per_iteration": 2.4801292419433594 + }, + { + "auxiliary_loss_clip": 0.01127439, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.04247904, + "balance_loss_mlp": 1.01879549, + "epoch": 0.7532014669632658, + "flos": 27748247328000.0, + "grad_norm": 1.4816375557735484, + "language_loss": 0.69224095, + "learning_rate": 6.05474969105289e-07, + "loss": 0.71377319, + "num_input_tokens_seen": 134683025, + "step": 6264, + "time_per_iteration": 2.579890727996826 + }, + { + "auxiliary_loss_clip": 0.01154516, + "auxiliary_loss_mlp": 0.01022254, + "balance_loss_clip": 1.04752994, + "balance_loss_mlp": 1.01479459, + "epoch": 0.7533217098539049, + "flos": 14137421333760.0, + "grad_norm": 2.1586156920426607, + "language_loss": 0.73210609, + "learning_rate": 6.049166956582725e-07, + "loss": 0.75387383, + "num_input_tokens_seen": 134701290, + "step": 6265, + "time_per_iteration": 2.4073972702026367 + }, + { + "auxiliary_loss_clip": 0.01148341, + "auxiliary_loss_mlp": 0.01019485, + "balance_loss_clip": 1.04510522, + "balance_loss_mlp": 1.01263928, + "epoch": 0.753441952744544, + "flos": 26429068437120.0, + "grad_norm": 2.938928415162882, + "language_loss": 0.87530792, + "learning_rate": 6.043586338442841e-07, + "loss": 0.89698619, + "num_input_tokens_seen": 134720345, + "step": 6266, + "time_per_iteration": 2.4748902320861816 + }, + { + "auxiliary_loss_clip": 0.01164837, + "auxiliary_loss_mlp": 0.01021315, + "balance_loss_clip": 1.04902983, + "balance_loss_mlp": 1.01508367, + "epoch": 0.7535621956351831, + "flos": 23878621192320.0, + "grad_norm": 1.7953251826041832, + "language_loss": 0.73136187, + "learning_rate": 6.038007837479815e-07, + "loss": 0.75322342, + "num_input_tokens_seen": 134741450, + "step": 6267, + "time_per_iteration": 2.4239704608917236 + }, + { + "auxiliary_loss_clip": 0.01150586, + "auxiliary_loss_mlp": 0.01026156, + "balance_loss_clip": 1.04704654, + "balance_loss_mlp": 1.0188092, + "epoch": 0.7536824385258222, + "flos": 21795873960960.0, + "grad_norm": 1.9848182025672334, + "language_loss": 0.64212251, + "learning_rate": 6.032431454539897e-07, + "loss": 0.66389, + "num_input_tokens_seen": 134760295, + "step": 6268, + "time_per_iteration": 2.4287822246551514 + }, + { + "auxiliary_loss_clip": 0.01127872, + "auxiliary_loss_mlp": 0.01028251, + "balance_loss_clip": 1.04533052, + "balance_loss_mlp": 1.02164364, + "epoch": 0.7538026814164612, + "flos": 28911644933760.0, + "grad_norm": 1.8428271433944978, + "language_loss": 0.81465751, + "learning_rate": 6.026857190469014e-07, + "loss": 0.83621871, + "num_input_tokens_seen": 134782050, + "step": 6269, + "time_per_iteration": 2.5652706623077393 + }, + { + "auxiliary_loss_clip": 0.01140396, + "auxiliary_loss_mlp": 0.01021166, + "balance_loss_clip": 1.04469585, + "balance_loss_mlp": 1.01370335, + "epoch": 0.7539229243071004, + "flos": 21104701482240.0, + "grad_norm": 6.019051247493286, + "language_loss": 0.74187309, + "learning_rate": 6.0212850461128e-07, + "loss": 0.76348871, + "num_input_tokens_seen": 134801170, + "step": 6270, + "time_per_iteration": 2.4639132022857666 + }, + { + "auxiliary_loss_clip": 0.01142583, + "auxiliary_loss_mlp": 0.01025749, + "balance_loss_clip": 1.04521811, + "balance_loss_mlp": 1.01805985, + "epoch": 0.7540431671977395, + "flos": 15158469340800.0, + "grad_norm": 2.074958772768113, + "language_loss": 0.74484587, + "learning_rate": 6.015715022316516e-07, + "loss": 0.76652914, + "num_input_tokens_seen": 134819150, + "step": 6271, + "time_per_iteration": 2.4526193141937256 + }, + { + "auxiliary_loss_clip": 0.01111515, + "auxiliary_loss_mlp": 0.01021022, + "balance_loss_clip": 1.04073691, + "balance_loss_mlp": 1.01323223, + "epoch": 0.7541634100883785, + "flos": 18770579896320.0, + "grad_norm": 2.8255669662731067, + "language_loss": 0.78141332, + "learning_rate": 6.010147119925154e-07, + "loss": 0.80273867, + "num_input_tokens_seen": 134836905, + "step": 6272, + "time_per_iteration": 2.5063862800598145 + }, + { + "auxiliary_loss_clip": 0.01117236, + "auxiliary_loss_mlp": 0.01023636, + "balance_loss_clip": 1.04290366, + "balance_loss_mlp": 1.01656091, + "epoch": 0.7542836529790176, + "flos": 20594770053120.0, + "grad_norm": 1.975469233250893, + "language_loss": 0.66183078, + "learning_rate": 6.004581339783348e-07, + "loss": 0.68323946, + "num_input_tokens_seen": 134855225, + "step": 6273, + "time_per_iteration": 2.518101930618286 + }, + { + "auxiliary_loss_clip": 0.01157985, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.04805088, + "balance_loss_mlp": 1.02433991, + "epoch": 0.7544038958696567, + "flos": 19095104298240.0, + "grad_norm": 2.6312171637024746, + "language_loss": 0.68493694, + "learning_rate": 5.999017682735425e-07, + "loss": 0.70683944, + "num_input_tokens_seen": 134871615, + "step": 6274, + "time_per_iteration": 2.5629022121429443 + }, + { + "auxiliary_loss_clip": 0.01104552, + "auxiliary_loss_mlp": 0.01027091, + "balance_loss_clip": 1.04253376, + "balance_loss_mlp": 1.01959562, + "epoch": 0.7545241387602958, + "flos": 31723306859520.0, + "grad_norm": 2.002320912570464, + "language_loss": 0.66431183, + "learning_rate": 5.993456149625387e-07, + "loss": 0.68562829, + "num_input_tokens_seen": 134892765, + "step": 6275, + "time_per_iteration": 2.713529348373413 + }, + { + "auxiliary_loss_clip": 0.0111629, + "auxiliary_loss_mlp": 0.01024479, + "balance_loss_clip": 1.04333472, + "balance_loss_mlp": 1.01787746, + "epoch": 0.7546443816509348, + "flos": 20296495514880.0, + "grad_norm": 1.7324732139310497, + "language_loss": 0.82305419, + "learning_rate": 5.987896741296909e-07, + "loss": 0.8444618, + "num_input_tokens_seen": 134910505, + "step": 6276, + "time_per_iteration": 2.501146078109741 + }, + { + "auxiliary_loss_clip": 0.01141642, + "auxiliary_loss_mlp": 0.01028032, + "balance_loss_clip": 1.04825306, + "balance_loss_mlp": 1.02077234, + "epoch": 0.754764624541574, + "flos": 23696159080320.0, + "grad_norm": 2.4922362202068573, + "language_loss": 0.783961, + "learning_rate": 5.982339458593361e-07, + "loss": 0.80565774, + "num_input_tokens_seen": 134930445, + "step": 6277, + "time_per_iteration": 2.5138630867004395 + }, + { + "auxiliary_loss_clip": 0.011516, + "auxiliary_loss_mlp": 0.00761755, + "balance_loss_clip": 1.04850817, + "balance_loss_mlp": 1.00035167, + "epoch": 0.7548848674322131, + "flos": 25337204766720.0, + "grad_norm": 1.5599870295067508, + "language_loss": 0.83855003, + "learning_rate": 5.976784302357767e-07, + "loss": 0.8576836, + "num_input_tokens_seen": 134951010, + "step": 6278, + "time_per_iteration": 2.4912402629852295 + }, + { + "auxiliary_loss_clip": 0.01156583, + "auxiliary_loss_mlp": 0.0102687, + "balance_loss_clip": 1.04810667, + "balance_loss_mlp": 1.01972926, + "epoch": 0.7550051103228521, + "flos": 19573147428480.0, + "grad_norm": 1.8807749057620473, + "language_loss": 0.73131055, + "learning_rate": 5.971231273432855e-07, + "loss": 0.7531451, + "num_input_tokens_seen": 134970495, + "step": 6279, + "time_per_iteration": 2.426175355911255 + }, + { + "auxiliary_loss_clip": 0.01060137, + "auxiliary_loss_mlp": 0.0100264, + "balance_loss_clip": 1.01441216, + "balance_loss_mlp": 1.0015732, + "epoch": 0.7551253532134913, + "flos": 64150068648960.0, + "grad_norm": 0.8104748717816136, + "language_loss": 0.54582214, + "learning_rate": 5.965680372661e-07, + "loss": 0.56644988, + "num_input_tokens_seen": 135028060, + "step": 6280, + "time_per_iteration": 2.947946310043335 + }, + { + "auxiliary_loss_clip": 0.01140063, + "auxiliary_loss_mlp": 0.01022911, + "balance_loss_clip": 1.04693937, + "balance_loss_mlp": 1.01661992, + "epoch": 0.7552455961041303, + "flos": 26067986968320.0, + "grad_norm": 1.8685220353776606, + "language_loss": 0.5656051, + "learning_rate": 5.960131600884266e-07, + "loss": 0.58723485, + "num_input_tokens_seen": 135047330, + "step": 6281, + "time_per_iteration": 2.5105249881744385 + }, + { + "auxiliary_loss_clip": 0.01129489, + "auxiliary_loss_mlp": 0.01022468, + "balance_loss_clip": 1.04580986, + "balance_loss_mlp": 1.01599193, + "epoch": 0.7553658389947694, + "flos": 24498223822080.0, + "grad_norm": 1.7390483990100314, + "language_loss": 0.7589674, + "learning_rate": 5.954584958944413e-07, + "loss": 0.78048694, + "num_input_tokens_seen": 135065995, + "step": 6282, + "time_per_iteration": 2.52677059173584 + }, + { + "auxiliary_loss_clip": 0.01127613, + "auxiliary_loss_mlp": 0.00761876, + "balance_loss_clip": 1.04241645, + "balance_loss_mlp": 1.00032043, + "epoch": 0.7554860818854086, + "flos": 21799465320960.0, + "grad_norm": 1.8134519638142823, + "language_loss": 0.81914151, + "learning_rate": 5.949040447682854e-07, + "loss": 0.83803642, + "num_input_tokens_seen": 135085820, + "step": 6283, + "time_per_iteration": 3.2417562007904053 + }, + { + "auxiliary_loss_clip": 0.0114508, + "auxiliary_loss_mlp": 0.01023518, + "balance_loss_clip": 1.0461179, + "balance_loss_mlp": 1.01664615, + "epoch": 0.7556063247760476, + "flos": 16362123114240.0, + "grad_norm": 3.0804674350465837, + "language_loss": 0.68603671, + "learning_rate": 5.943498067940686e-07, + "loss": 0.70772272, + "num_input_tokens_seen": 135102845, + "step": 6284, + "time_per_iteration": 3.283679723739624 + }, + { + "auxiliary_loss_clip": 0.01140149, + "auxiliary_loss_mlp": 0.01027324, + "balance_loss_clip": 1.05275655, + "balance_loss_mlp": 1.01999843, + "epoch": 0.7557265676666867, + "flos": 27235155502080.0, + "grad_norm": 1.6548234805581539, + "language_loss": 0.81631684, + "learning_rate": 5.937957820558686e-07, + "loss": 0.8379916, + "num_input_tokens_seen": 135122190, + "step": 6285, + "time_per_iteration": 3.3709068298339844 + }, + { + "auxiliary_loss_clip": 0.01049981, + "auxiliary_loss_mlp": 0.01001505, + "balance_loss_clip": 1.01219296, + "balance_loss_mlp": 1.00046813, + "epoch": 0.7558468105573258, + "flos": 62189131415040.0, + "grad_norm": 0.8473369133802555, + "language_loss": 0.65465021, + "learning_rate": 5.932419706377296e-07, + "loss": 0.67516506, + "num_input_tokens_seen": 135180495, + "step": 6286, + "time_per_iteration": 3.029397487640381 + }, + { + "auxiliary_loss_clip": 0.01125351, + "auxiliary_loss_mlp": 0.01022959, + "balance_loss_clip": 1.04825294, + "balance_loss_mlp": 1.01568711, + "epoch": 0.7559670534479649, + "flos": 33249078823680.0, + "grad_norm": 1.8219643399996757, + "language_loss": 0.74012923, + "learning_rate": 5.92688372623666e-07, + "loss": 0.76161242, + "num_input_tokens_seen": 135199200, + "step": 6287, + "time_per_iteration": 2.5882673263549805 + }, + { + "auxiliary_loss_clip": 0.01153921, + "auxiliary_loss_mlp": 0.01022768, + "balance_loss_clip": 1.04545832, + "balance_loss_mlp": 1.01556754, + "epoch": 0.7560872963386039, + "flos": 14064379027200.0, + "grad_norm": 2.0510412323897054, + "language_loss": 0.73849952, + "learning_rate": 5.921349880976574e-07, + "loss": 0.76026642, + "num_input_tokens_seen": 135217035, + "step": 6288, + "time_per_iteration": 3.205052137374878 + }, + { + "auxiliary_loss_clip": 0.01142175, + "auxiliary_loss_mlp": 0.00762325, + "balance_loss_clip": 1.04425597, + "balance_loss_mlp": 1.00038171, + "epoch": 0.7562075392292431, + "flos": 20412307941120.0, + "grad_norm": 1.8696031497189385, + "language_loss": 0.82365584, + "learning_rate": 5.915818171436515e-07, + "loss": 0.84270084, + "num_input_tokens_seen": 135236370, + "step": 6289, + "time_per_iteration": 2.487180233001709 + }, + { + "auxiliary_loss_clip": 0.01138726, + "auxiliary_loss_mlp": 0.01025319, + "balance_loss_clip": 1.04262042, + "balance_loss_mlp": 1.01832187, + "epoch": 0.7563277821198822, + "flos": 20376792368640.0, + "grad_norm": 2.262014802609539, + "language_loss": 0.74437463, + "learning_rate": 5.910288598455642e-07, + "loss": 0.76601505, + "num_input_tokens_seen": 135255720, + "step": 6290, + "time_per_iteration": 2.4932262897491455 + }, + { + "auxiliary_loss_clip": 0.01159518, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.04856634, + "balance_loss_mlp": 1.02306581, + "epoch": 0.7564480250105212, + "flos": 18588261438720.0, + "grad_norm": 2.1522170690851654, + "language_loss": 0.74586409, + "learning_rate": 5.90476116287278e-07, + "loss": 0.76776308, + "num_input_tokens_seen": 135273320, + "step": 6291, + "time_per_iteration": 2.4486305713653564 + }, + { + "auxiliary_loss_clip": 0.01140761, + "auxiliary_loss_mlp": 0.0102789, + "balance_loss_clip": 1.04886472, + "balance_loss_mlp": 1.02071905, + "epoch": 0.7565682679011604, + "flos": 21215521918080.0, + "grad_norm": 1.8188353913665978, + "language_loss": 0.68033367, + "learning_rate": 5.899235865526456e-07, + "loss": 0.70202017, + "num_input_tokens_seen": 135292615, + "step": 6292, + "time_per_iteration": 2.476522922515869 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.0102574, + "balance_loss_clip": 1.04239488, + "balance_loss_mlp": 1.01904321, + "epoch": 0.7566885107917994, + "flos": 20449008662400.0, + "grad_norm": 1.7866296552006307, + "language_loss": 0.82062238, + "learning_rate": 5.893712707254825e-07, + "loss": 0.84204674, + "num_input_tokens_seen": 135310075, + "step": 6293, + "time_per_iteration": 2.4998016357421875 + }, + { + "auxiliary_loss_clip": 0.01106037, + "auxiliary_loss_mlp": 0.01023374, + "balance_loss_clip": 1.04016626, + "balance_loss_mlp": 1.01555347, + "epoch": 0.7568087536824385, + "flos": 19025832919680.0, + "grad_norm": 2.5138350037075416, + "language_loss": 0.65726644, + "learning_rate": 5.888191688895769e-07, + "loss": 0.67856061, + "num_input_tokens_seen": 135327335, + "step": 6294, + "time_per_iteration": 2.5095326900482178 + }, + { + "auxiliary_loss_clip": 0.01167241, + "auxiliary_loss_mlp": 0.01027279, + "balance_loss_clip": 1.04636717, + "balance_loss_mlp": 1.0193038, + "epoch": 0.7569289965730777, + "flos": 15225442248960.0, + "grad_norm": 2.2802058498177815, + "language_loss": 0.61875337, + "learning_rate": 5.882672811286813e-07, + "loss": 0.64069855, + "num_input_tokens_seen": 135343615, + "step": 6295, + "time_per_iteration": 2.3773233890533447 + }, + { + "auxiliary_loss_clip": 0.01169594, + "auxiliary_loss_mlp": 0.01026128, + "balance_loss_clip": 1.04825592, + "balance_loss_mlp": 1.01882625, + "epoch": 0.7570492394637167, + "flos": 20769367086720.0, + "grad_norm": 2.0122279612698626, + "language_loss": 0.69431567, + "learning_rate": 5.877156075265166e-07, + "loss": 0.71627289, + "num_input_tokens_seen": 135359880, + "step": 6296, + "time_per_iteration": 2.3994789123535156 + }, + { + "auxiliary_loss_clip": 0.01139063, + "auxiliary_loss_mlp": 0.01023036, + "balance_loss_clip": 1.04427767, + "balance_loss_mlp": 1.01538897, + "epoch": 0.7571694823543558, + "flos": 15664091137920.0, + "grad_norm": 2.568869135287148, + "language_loss": 0.69711959, + "learning_rate": 5.871641481667715e-07, + "loss": 0.71874058, + "num_input_tokens_seen": 135374325, + "step": 6297, + "time_per_iteration": 2.4964351654052734 + }, + { + "auxiliary_loss_clip": 0.01114522, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.04375172, + "balance_loss_mlp": 1.02252758, + "epoch": 0.7572897252449949, + "flos": 25409241492480.0, + "grad_norm": 1.8676425103516465, + "language_loss": 0.84341598, + "learning_rate": 5.866129031331011e-07, + "loss": 0.86486351, + "num_input_tokens_seen": 135393980, + "step": 6298, + "time_per_iteration": 2.567317008972168 + }, + { + "auxiliary_loss_clip": 0.01141324, + "auxiliary_loss_mlp": 0.01025239, + "balance_loss_clip": 1.04521346, + "balance_loss_mlp": 1.01779151, + "epoch": 0.757409968135634, + "flos": 24279348297600.0, + "grad_norm": 1.9529690391343197, + "language_loss": 0.83697963, + "learning_rate": 5.8606187250913e-07, + "loss": 0.85864532, + "num_input_tokens_seen": 135412030, + "step": 6299, + "time_per_iteration": 2.510033130645752 + }, + { + "auxiliary_loss_clip": 0.01154667, + "auxiliary_loss_mlp": 0.00761668, + "balance_loss_clip": 1.04941511, + "balance_loss_mlp": 1.0003686, + "epoch": 0.757530211026273, + "flos": 24133766474880.0, + "grad_norm": 2.0252841485884705, + "language_loss": 0.84210235, + "learning_rate": 5.855110563784482e-07, + "loss": 0.86126566, + "num_input_tokens_seen": 135430565, + "step": 6300, + "time_per_iteration": 2.453111171722412 + }, + { + "auxiliary_loss_clip": 0.01147931, + "auxiliary_loss_mlp": 0.00761874, + "balance_loss_clip": 1.04491329, + "balance_loss_mlp": 1.00032973, + "epoch": 0.7576504539169122, + "flos": 23951807153280.0, + "grad_norm": 2.119686534995756, + "language_loss": 0.64331168, + "learning_rate": 5.849604548246156e-07, + "loss": 0.66240972, + "num_input_tokens_seen": 135451675, + "step": 6301, + "time_per_iteration": 2.470508337020874 + }, + { + "auxiliary_loss_clip": 0.01145463, + "auxiliary_loss_mlp": 0.00761639, + "balance_loss_clip": 1.04802024, + "balance_loss_mlp": 1.00039577, + "epoch": 0.7577706968075513, + "flos": 21251360712960.0, + "grad_norm": 2.0044358807027254, + "language_loss": 0.80423015, + "learning_rate": 5.844100679311565e-07, + "loss": 0.8233012, + "num_input_tokens_seen": 135470635, + "step": 6302, + "time_per_iteration": 2.495260238647461 + }, + { + "auxiliary_loss_clip": 0.01143786, + "auxiliary_loss_mlp": 0.01022826, + "balance_loss_clip": 1.04951477, + "balance_loss_mlp": 1.01506507, + "epoch": 0.7578909396981903, + "flos": 18296595002880.0, + "grad_norm": 2.768617825031151, + "language_loss": 0.76597059, + "learning_rate": 5.838598957815637e-07, + "loss": 0.78763676, + "num_input_tokens_seen": 135487865, + "step": 6303, + "time_per_iteration": 2.4927802085876465 + }, + { + "auxiliary_loss_clip": 0.01132554, + "auxiliary_loss_mlp": 0.01023391, + "balance_loss_clip": 1.04364729, + "balance_loss_mlp": 1.01637888, + "epoch": 0.7580111825888295, + "flos": 25373869574400.0, + "grad_norm": 1.5876508215880005, + "language_loss": 0.85302413, + "learning_rate": 5.833099384592996e-07, + "loss": 0.8745836, + "num_input_tokens_seen": 135508440, + "step": 6304, + "time_per_iteration": 2.5008881092071533 + }, + { + "auxiliary_loss_clip": 0.01134277, + "auxiliary_loss_mlp": 0.0102561, + "balance_loss_clip": 1.04370856, + "balance_loss_mlp": 1.01822841, + "epoch": 0.7581314254794685, + "flos": 23768662682880.0, + "grad_norm": 2.2896850810795932, + "language_loss": 0.7142691, + "learning_rate": 5.827601960477913e-07, + "loss": 0.73586798, + "num_input_tokens_seen": 135526365, + "step": 6305, + "time_per_iteration": 2.475066900253296 + }, + { + "auxiliary_loss_clip": 0.01148639, + "auxiliary_loss_mlp": 0.01026794, + "balance_loss_clip": 1.04398727, + "balance_loss_mlp": 1.02036834, + "epoch": 0.7582516683701076, + "flos": 22054610603520.0, + "grad_norm": 2.4584644419153996, + "language_loss": 0.70667589, + "learning_rate": 5.822106686304344e-07, + "loss": 0.72843021, + "num_input_tokens_seen": 135545655, + "step": 6306, + "time_per_iteration": 2.4346423149108887 + }, + { + "auxiliary_loss_clip": 0.01133031, + "auxiliary_loss_mlp": 0.01027149, + "balance_loss_clip": 1.04505849, + "balance_loss_mlp": 1.01981211, + "epoch": 0.7583719112607467, + "flos": 31649725848960.0, + "grad_norm": 1.6930919223942935, + "language_loss": 0.57783252, + "learning_rate": 5.816613562905919e-07, + "loss": 0.59943438, + "num_input_tokens_seen": 135566840, + "step": 6307, + "time_per_iteration": 2.568549156188965 + }, + { + "auxiliary_loss_clip": 0.01125657, + "auxiliary_loss_mlp": 0.01025938, + "balance_loss_clip": 1.04938924, + "balance_loss_mlp": 1.0185889, + "epoch": 0.7584921541513858, + "flos": 33068376478080.0, + "grad_norm": 1.5333518695777595, + "language_loss": 0.70068789, + "learning_rate": 5.811122591115933e-07, + "loss": 0.72220385, + "num_input_tokens_seen": 135587825, + "step": 6308, + "time_per_iteration": 2.584228038787842 + }, + { + "auxiliary_loss_clip": 0.01128449, + "auxiliary_loss_mlp": 0.010271, + "balance_loss_clip": 1.05008614, + "balance_loss_mlp": 1.01992035, + "epoch": 0.7586123970420249, + "flos": 23326350606720.0, + "grad_norm": 2.238996947319549, + "language_loss": 0.71755725, + "learning_rate": 5.805633771767376e-07, + "loss": 0.73911279, + "num_input_tokens_seen": 135605220, + "step": 6309, + "time_per_iteration": 2.510951042175293 + }, + { + "auxiliary_loss_clip": 0.0113747, + "auxiliary_loss_mlp": 0.01023247, + "balance_loss_clip": 1.04668093, + "balance_loss_mlp": 1.01530159, + "epoch": 0.7587326399326639, + "flos": 18334229477760.0, + "grad_norm": 1.6933524051913462, + "language_loss": 0.77881581, + "learning_rate": 5.800147105692888e-07, + "loss": 0.80042303, + "num_input_tokens_seen": 135624795, + "step": 6310, + "time_per_iteration": 3.3414499759674072 + }, + { + "auxiliary_loss_clip": 0.01153978, + "auxiliary_loss_mlp": 0.0102343, + "balance_loss_clip": 1.04512882, + "balance_loss_mlp": 1.01655173, + "epoch": 0.7588528828233031, + "flos": 17275080119040.0, + "grad_norm": 1.9194437294654993, + "language_loss": 0.79095036, + "learning_rate": 5.794662593724795e-07, + "loss": 0.81272447, + "num_input_tokens_seen": 135643800, + "step": 6311, + "time_per_iteration": 3.282911777496338 + }, + { + "auxiliary_loss_clip": 0.01171036, + "auxiliary_loss_mlp": 0.01029126, + "balance_loss_clip": 1.05131805, + "balance_loss_mlp": 1.02155638, + "epoch": 0.7589731257139422, + "flos": 17713621267200.0, + "grad_norm": 1.8620825579727243, + "language_loss": 0.74696571, + "learning_rate": 5.789180236695091e-07, + "loss": 0.76896739, + "num_input_tokens_seen": 135660655, + "step": 6312, + "time_per_iteration": 2.3927524089813232 + }, + { + "auxiliary_loss_clip": 0.01148521, + "auxiliary_loss_mlp": 0.01026573, + "balance_loss_clip": 1.04728293, + "balance_loss_mlp": 1.02004337, + "epoch": 0.7590933686045812, + "flos": 15961072786560.0, + "grad_norm": 1.9442211615831841, + "language_loss": 0.85097373, + "learning_rate": 5.78370003543544e-07, + "loss": 0.87272465, + "num_input_tokens_seen": 135679410, + "step": 6313, + "time_per_iteration": 2.412609338760376 + }, + { + "auxiliary_loss_clip": 0.01154447, + "auxiliary_loss_mlp": 0.00761981, + "balance_loss_clip": 1.04832172, + "balance_loss_mlp": 1.00038409, + "epoch": 0.7592136114952204, + "flos": 21068072588160.0, + "grad_norm": 2.083486064330563, + "language_loss": 0.8371017, + "learning_rate": 5.778221990777203e-07, + "loss": 0.85626602, + "num_input_tokens_seen": 135697150, + "step": 6314, + "time_per_iteration": 2.4422402381896973 + }, + { + "auxiliary_loss_clip": 0.0114386, + "auxiliary_loss_mlp": 0.01026918, + "balance_loss_clip": 1.04937983, + "balance_loss_mlp": 1.01949739, + "epoch": 0.7593338543858594, + "flos": 25297666871040.0, + "grad_norm": 2.5559611022323057, + "language_loss": 0.82623255, + "learning_rate": 5.772746103551372e-07, + "loss": 0.84794039, + "num_input_tokens_seen": 135712545, + "step": 6315, + "time_per_iteration": 3.2547085285186768 + }, + { + "auxiliary_loss_clip": 0.01138076, + "auxiliary_loss_mlp": 0.01021115, + "balance_loss_clip": 1.04687536, + "balance_loss_mlp": 1.01356304, + "epoch": 0.7594540972764985, + "flos": 31832367528960.0, + "grad_norm": 1.7597377282269855, + "language_loss": 0.72190166, + "learning_rate": 5.767272374588648e-07, + "loss": 0.74349356, + "num_input_tokens_seen": 135733950, + "step": 6316, + "time_per_iteration": 2.556236743927002 + }, + { + "auxiliary_loss_clip": 0.0115302, + "auxiliary_loss_mlp": 0.01024452, + "balance_loss_clip": 1.04926753, + "balance_loss_mlp": 1.01697111, + "epoch": 0.7595743401671377, + "flos": 37597250880000.0, + "grad_norm": 1.6275790280984483, + "language_loss": 0.77904081, + "learning_rate": 5.76180080471939e-07, + "loss": 0.80081546, + "num_input_tokens_seen": 135757120, + "step": 6317, + "time_per_iteration": 2.581899404525757 + }, + { + "auxiliary_loss_clip": 0.01172257, + "auxiliary_loss_mlp": 0.0102548, + "balance_loss_clip": 1.04931593, + "balance_loss_mlp": 1.01769018, + "epoch": 0.7596945830577767, + "flos": 18287724343680.0, + "grad_norm": 2.3262235525151467, + "language_loss": 0.72180593, + "learning_rate": 5.756331394773631e-07, + "loss": 0.74378324, + "num_input_tokens_seen": 135773335, + "step": 6318, + "time_per_iteration": 2.3827598094940186 + }, + { + "auxiliary_loss_clip": 0.01099971, + "auxiliary_loss_mlp": 0.00762549, + "balance_loss_clip": 1.04296005, + "balance_loss_mlp": 1.00030613, + "epoch": 0.7598148259484158, + "flos": 22233122219520.0, + "grad_norm": 1.696122889528536, + "language_loss": 0.76261896, + "learning_rate": 5.750864145581071e-07, + "loss": 0.78124416, + "num_input_tokens_seen": 135792555, + "step": 6319, + "time_per_iteration": 2.5931317806243896 + }, + { + "auxiliary_loss_clip": 0.01168761, + "auxiliary_loss_mlp": 0.0102676, + "balance_loss_clip": 1.0505681, + "balance_loss_mlp": 1.01969683, + "epoch": 0.7599350688390549, + "flos": 27161718145920.0, + "grad_norm": 1.9137831335682258, + "language_loss": 0.86044788, + "learning_rate": 5.745399057971085e-07, + "loss": 0.88240302, + "num_input_tokens_seen": 135813690, + "step": 6320, + "time_per_iteration": 2.442300796508789 + }, + { + "auxiliary_loss_clip": 0.01158099, + "auxiliary_loss_mlp": 0.01026452, + "balance_loss_clip": 1.0477246, + "balance_loss_mlp": 1.01948154, + "epoch": 0.760055311729694, + "flos": 15560704817280.0, + "grad_norm": 2.6722157325804483, + "language_loss": 0.75298023, + "learning_rate": 5.739936132772738e-07, + "loss": 0.77482575, + "num_input_tokens_seen": 135832255, + "step": 6321, + "time_per_iteration": 2.4137682914733887 + }, + { + "auxiliary_loss_clip": 0.01166094, + "auxiliary_loss_mlp": 0.01025137, + "balance_loss_clip": 1.04670417, + "balance_loss_mlp": 1.01763618, + "epoch": 0.760175554620333, + "flos": 25155496840320.0, + "grad_norm": 1.9331697096220553, + "language_loss": 0.74442792, + "learning_rate": 5.734475370814733e-07, + "loss": 0.76634014, + "num_input_tokens_seen": 135851935, + "step": 6322, + "time_per_iteration": 2.423246145248413 + }, + { + "auxiliary_loss_clip": 0.01154586, + "auxiliary_loss_mlp": 0.01022408, + "balance_loss_clip": 1.04521108, + "balance_loss_mlp": 1.01519561, + "epoch": 0.7602957975109722, + "flos": 24353791234560.0, + "grad_norm": 1.6153738544491134, + "language_loss": 0.78686702, + "learning_rate": 5.729016772925483e-07, + "loss": 0.80863696, + "num_input_tokens_seen": 135873510, + "step": 6323, + "time_per_iteration": 2.457078456878662 + }, + { + "auxiliary_loss_clip": 0.01110636, + "auxiliary_loss_mlp": 0.01023412, + "balance_loss_clip": 1.04593015, + "balance_loss_mlp": 1.01565695, + "epoch": 0.7604160404016113, + "flos": 25192664438400.0, + "grad_norm": 1.9692730370201807, + "language_loss": 0.70632821, + "learning_rate": 5.723560339933038e-07, + "loss": 0.7276687, + "num_input_tokens_seen": 135893845, + "step": 6324, + "time_per_iteration": 2.5802526473999023 + }, + { + "auxiliary_loss_clip": 0.01151902, + "auxiliary_loss_mlp": 0.00761791, + "balance_loss_clip": 1.04662216, + "balance_loss_mlp": 1.00032759, + "epoch": 0.7605362832922503, + "flos": 29861841363840.0, + "grad_norm": 2.3544129384779113, + "language_loss": 0.65566218, + "learning_rate": 5.71810607266513e-07, + "loss": 0.67479908, + "num_input_tokens_seen": 135912430, + "step": 6325, + "time_per_iteration": 2.4846885204315186 + }, + { + "auxiliary_loss_clip": 0.01153364, + "auxiliary_loss_mlp": 0.01024451, + "balance_loss_clip": 1.04577374, + "balance_loss_mlp": 1.01741421, + "epoch": 0.7606565261828895, + "flos": 13917935278080.0, + "grad_norm": 2.1290613247396397, + "language_loss": 0.60401756, + "learning_rate": 5.712653971949184e-07, + "loss": 0.62579566, + "num_input_tokens_seen": 135930550, + "step": 6326, + "time_per_iteration": 2.4105725288391113 + }, + { + "auxiliary_loss_clip": 0.01148444, + "auxiliary_loss_mlp": 0.01022567, + "balance_loss_clip": 1.04608309, + "balance_loss_mlp": 1.01508641, + "epoch": 0.7607767690735285, + "flos": 18551273408640.0, + "grad_norm": 2.197977500654092, + "language_loss": 0.7530911, + "learning_rate": 5.707204038612268e-07, + "loss": 0.77480125, + "num_input_tokens_seen": 135947980, + "step": 6327, + "time_per_iteration": 2.3939802646636963 + }, + { + "auxiliary_loss_clip": 0.01151838, + "auxiliary_loss_mlp": 0.01027807, + "balance_loss_clip": 1.05369282, + "balance_loss_mlp": 1.02002311, + "epoch": 0.7608970119641676, + "flos": 20922993555840.0, + "grad_norm": 4.1618849003005325, + "language_loss": 0.74070084, + "learning_rate": 5.701756273481138e-07, + "loss": 0.76249731, + "num_input_tokens_seen": 135965400, + "step": 6328, + "time_per_iteration": 2.4617631435394287 + }, + { + "auxiliary_loss_clip": 0.01143758, + "auxiliary_loss_mlp": 0.01023547, + "balance_loss_clip": 1.04528272, + "balance_loss_mlp": 1.01641846, + "epoch": 0.7610172548548068, + "flos": 23807302738560.0, + "grad_norm": 1.4458777093984885, + "language_loss": 0.73968965, + "learning_rate": 5.696310677382212e-07, + "loss": 0.76136267, + "num_input_tokens_seen": 135986795, + "step": 6329, + "time_per_iteration": 2.492581367492676 + }, + { + "auxiliary_loss_clip": 0.01030973, + "auxiliary_loss_mlp": 0.01001464, + "balance_loss_clip": 1.01165676, + "balance_loss_mlp": 1.00058734, + "epoch": 0.7611374977454458, + "flos": 66496580426880.0, + "grad_norm": 0.8720645697198282, + "language_loss": 0.61830842, + "learning_rate": 5.690867251141576e-07, + "loss": 0.63863277, + "num_input_tokens_seen": 136053450, + "step": 6330, + "time_per_iteration": 3.2151288986206055 + }, + { + "auxiliary_loss_clip": 0.0116113, + "auxiliary_loss_mlp": 0.01025574, + "balance_loss_clip": 1.04748785, + "balance_loss_mlp": 1.01826048, + "epoch": 0.7612577406360849, + "flos": 15633136592640.0, + "grad_norm": 2.993612463154022, + "language_loss": 0.9221698, + "learning_rate": 5.685425995585013e-07, + "loss": 0.94403684, + "num_input_tokens_seen": 136071375, + "step": 6331, + "time_per_iteration": 2.42741322517395 + }, + { + "auxiliary_loss_clip": 0.01048877, + "auxiliary_loss_mlp": 0.01004664, + "balance_loss_clip": 1.01247931, + "balance_loss_mlp": 1.0036273, + "epoch": 0.761377983526724, + "flos": 60526253237760.0, + "grad_norm": 3.8826306507364126, + "language_loss": 0.59045696, + "learning_rate": 5.679986911537935e-07, + "loss": 0.61099243, + "num_input_tokens_seen": 136138905, + "step": 6332, + "time_per_iteration": 3.1932616233825684 + }, + { + "auxiliary_loss_clip": 0.0110315, + "auxiliary_loss_mlp": 0.01021479, + "balance_loss_clip": 1.04544961, + "balance_loss_mlp": 1.01418352, + "epoch": 0.7614982264173631, + "flos": 35772522019200.0, + "grad_norm": 2.1024744526939783, + "language_loss": 0.67546231, + "learning_rate": 5.674549999825462e-07, + "loss": 0.69670856, + "num_input_tokens_seen": 136161720, + "step": 6333, + "time_per_iteration": 2.655038833618164 + }, + { + "auxiliary_loss_clip": 0.01058547, + "auxiliary_loss_mlp": 0.010017, + "balance_loss_clip": 1.01239836, + "balance_loss_mlp": 1.00067508, + "epoch": 0.7616184693080021, + "flos": 67925502345600.0, + "grad_norm": 0.9183383795375871, + "language_loss": 0.71333563, + "learning_rate": 5.669115261272363e-07, + "loss": 0.7339381, + "num_input_tokens_seen": 136222040, + "step": 6334, + "time_per_iteration": 3.023500442504883 + }, + { + "auxiliary_loss_clip": 0.01155372, + "auxiliary_loss_mlp": 0.010263, + "balance_loss_clip": 1.04781616, + "balance_loss_mlp": 1.01885581, + "epoch": 0.7617387121986413, + "flos": 20521979141760.0, + "grad_norm": 2.3209834273778966, + "language_loss": 0.72619081, + "learning_rate": 5.663682696703081e-07, + "loss": 0.74800754, + "num_input_tokens_seen": 136240305, + "step": 6335, + "time_per_iteration": 2.424441337585449 + }, + { + "auxiliary_loss_clip": 0.01166859, + "auxiliary_loss_mlp": 0.01022455, + "balance_loss_clip": 1.04935789, + "balance_loss_mlp": 1.01562095, + "epoch": 0.7618589550892804, + "flos": 18624495283200.0, + "grad_norm": 2.13907609455361, + "language_loss": 0.82004869, + "learning_rate": 5.658252306941746e-07, + "loss": 0.84194183, + "num_input_tokens_seen": 136259625, + "step": 6336, + "time_per_iteration": 2.392655372619629 + }, + { + "auxiliary_loss_clip": 0.01113659, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.04392028, + "balance_loss_mlp": 1.02003527, + "epoch": 0.7619791979799194, + "flos": 17453735389440.0, + "grad_norm": 2.0472294080051188, + "language_loss": 0.75158238, + "learning_rate": 5.65282409281212e-07, + "loss": 0.77299917, + "num_input_tokens_seen": 136277090, + "step": 6337, + "time_per_iteration": 4.117333173751831 + }, + { + "auxiliary_loss_clip": 0.01136397, + "auxiliary_loss_mlp": 0.01024597, + "balance_loss_clip": 1.0446856, + "balance_loss_mlp": 1.01742387, + "epoch": 0.7620994408705585, + "flos": 14137421333760.0, + "grad_norm": 2.817679875138117, + "language_loss": 0.70092213, + "learning_rate": 5.64739805513768e-07, + "loss": 0.72253203, + "num_input_tokens_seen": 136294635, + "step": 6338, + "time_per_iteration": 3.2620275020599365 + }, + { + "auxiliary_loss_clip": 0.01053926, + "auxiliary_loss_mlp": 0.00753067, + "balance_loss_clip": 1.01274848, + "balance_loss_mlp": 1.00004756, + "epoch": 0.7622196837611976, + "flos": 70708792527360.0, + "grad_norm": 0.7868444945715333, + "language_loss": 0.55714929, + "learning_rate": 5.641974194741541e-07, + "loss": 0.57521927, + "num_input_tokens_seen": 136350320, + "step": 6339, + "time_per_iteration": 2.9368066787719727 + }, + { + "auxiliary_loss_clip": 0.0104348, + "auxiliary_loss_mlp": 0.01002475, + "balance_loss_clip": 1.02246153, + "balance_loss_mlp": 1.00114536, + "epoch": 0.7623399266518367, + "flos": 60684150447360.0, + "grad_norm": 0.7965721282224352, + "language_loss": 0.63755411, + "learning_rate": 5.636552512446502e-07, + "loss": 0.65801364, + "num_input_tokens_seen": 136411375, + "step": 6340, + "time_per_iteration": 2.9715940952301025 + }, + { + "auxiliary_loss_clip": 0.01147749, + "auxiliary_loss_mlp": 0.0102425, + "balance_loss_clip": 1.04569745, + "balance_loss_mlp": 1.01706481, + "epoch": 0.7624601695424758, + "flos": 26468893641600.0, + "grad_norm": 1.9332538891919064, + "language_loss": 0.77745032, + "learning_rate": 5.631133009075027e-07, + "loss": 0.79917026, + "num_input_tokens_seen": 136430560, + "step": 6341, + "time_per_iteration": 2.4662132263183594 + }, + { + "auxiliary_loss_clip": 0.01154421, + "auxiliary_loss_mlp": 0.00761352, + "balance_loss_clip": 1.04762554, + "balance_loss_mlp": 1.00031853, + "epoch": 0.7625804124331149, + "flos": 19135755515520.0, + "grad_norm": 1.7510602788399283, + "language_loss": 0.68527842, + "learning_rate": 5.625715685449242e-07, + "loss": 0.70443618, + "num_input_tokens_seen": 136448665, + "step": 6342, + "time_per_iteration": 3.2139532566070557 + }, + { + "auxiliary_loss_clip": 0.01128327, + "auxiliary_loss_mlp": 0.01025456, + "balance_loss_clip": 1.05097675, + "balance_loss_mlp": 1.01882803, + "epoch": 0.762700655323754, + "flos": 26213101914240.0, + "grad_norm": 1.629358366918023, + "language_loss": 0.71679193, + "learning_rate": 5.620300542390966e-07, + "loss": 0.73832971, + "num_input_tokens_seen": 136469710, + "step": 6343, + "time_per_iteration": 2.5413007736206055 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01024264, + "balance_loss_clip": 1.0427084, + "balance_loss_mlp": 1.01771009, + "epoch": 0.762820898214393, + "flos": 22382582711040.0, + "grad_norm": 1.9192033933468766, + "language_loss": 0.849204, + "learning_rate": 5.614887580721659e-07, + "loss": 0.87080508, + "num_input_tokens_seen": 136489855, + "step": 6344, + "time_per_iteration": 2.4717347621917725 + }, + { + "auxiliary_loss_clip": 0.01117947, + "auxiliary_loss_mlp": 0.0103225, + "balance_loss_clip": 1.04618454, + "balance_loss_mlp": 1.02439153, + "epoch": 0.7629411411050322, + "flos": 15700504550400.0, + "grad_norm": 1.9564479352594817, + "language_loss": 0.73763824, + "learning_rate": 5.609476801262481e-07, + "loss": 0.75914013, + "num_input_tokens_seen": 136504715, + "step": 6345, + "time_per_iteration": 2.42869234085083 + }, + { + "auxiliary_loss_clip": 0.01126714, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.04847682, + "balance_loss_mlp": 1.01982534, + "epoch": 0.7630613839956712, + "flos": 13770342293760.0, + "grad_norm": 2.292100234323005, + "language_loss": 0.63951814, + "learning_rate": 5.604068204834223e-07, + "loss": 0.66105622, + "num_input_tokens_seen": 136521610, + "step": 6346, + "time_per_iteration": 2.4694647789001465 + }, + { + "auxiliary_loss_clip": 0.01111715, + "auxiliary_loss_mlp": 0.00762354, + "balance_loss_clip": 1.04542756, + "balance_loss_mlp": 1.000283, + "epoch": 0.7631816268863103, + "flos": 14569569861120.0, + "grad_norm": 2.215049450923147, + "language_loss": 0.76720798, + "learning_rate": 5.598661792257367e-07, + "loss": 0.78594863, + "num_input_tokens_seen": 136538655, + "step": 6347, + "time_per_iteration": 2.509542942047119 + }, + { + "auxiliary_loss_clip": 0.01150963, + "auxiliary_loss_mlp": 0.01024132, + "balance_loss_clip": 1.04513907, + "balance_loss_mlp": 1.01718175, + "epoch": 0.7633018697769495, + "flos": 19062210418560.0, + "grad_norm": 1.9012677857481546, + "language_loss": 0.75694811, + "learning_rate": 5.593257564352071e-07, + "loss": 0.77869904, + "num_input_tokens_seen": 136557095, + "step": 6348, + "time_per_iteration": 2.4221768379211426 + }, + { + "auxiliary_loss_clip": 0.01151656, + "auxiliary_loss_mlp": 0.0102128, + "balance_loss_clip": 1.047261, + "balance_loss_mlp": 1.01425278, + "epoch": 0.7634221126675885, + "flos": 22052958577920.0, + "grad_norm": 1.546631299463628, + "language_loss": 0.75615633, + "learning_rate": 5.58785552193815e-07, + "loss": 0.77788568, + "num_input_tokens_seen": 136577340, + "step": 6349, + "time_per_iteration": 2.4384586811065674 + }, + { + "auxiliary_loss_clip": 0.01168528, + "auxiliary_loss_mlp": 0.01022205, + "balance_loss_clip": 1.04902315, + "balance_loss_mlp": 1.01530015, + "epoch": 0.7635423555582276, + "flos": 29382720825600.0, + "grad_norm": 2.1326665327485106, + "language_loss": 0.75637043, + "learning_rate": 5.582455665835086e-07, + "loss": 0.77827775, + "num_input_tokens_seen": 136597635, + "step": 6350, + "time_per_iteration": 2.4667088985443115 + }, + { + "auxiliary_loss_clip": 0.01150137, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.04547262, + "balance_loss_mlp": 1.02212048, + "epoch": 0.7636625984488667, + "flos": 17784903807360.0, + "grad_norm": 2.596099468780283, + "language_loss": 0.72555166, + "learning_rate": 5.577057996862036e-07, + "loss": 0.7473563, + "num_input_tokens_seen": 136615260, + "step": 6351, + "time_per_iteration": 2.4519476890563965 + }, + { + "auxiliary_loss_clip": 0.01163613, + "auxiliary_loss_mlp": 0.01024034, + "balance_loss_clip": 1.04783523, + "balance_loss_mlp": 1.01721549, + "epoch": 0.7637828413395058, + "flos": 23734583654400.0, + "grad_norm": 1.6057167603519746, + "language_loss": 0.76335788, + "learning_rate": 5.571662515837814e-07, + "loss": 0.78523433, + "num_input_tokens_seen": 136637220, + "step": 6352, + "time_per_iteration": 2.4916059970855713 + }, + { + "auxiliary_loss_clip": 0.01138141, + "auxiliary_loss_mlp": 0.01024031, + "balance_loss_clip": 1.04600549, + "balance_loss_mlp": 1.01711392, + "epoch": 0.7639030842301449, + "flos": 36283279461120.0, + "grad_norm": 1.873407667167574, + "language_loss": 0.83926988, + "learning_rate": 5.566269223580926e-07, + "loss": 0.86089158, + "num_input_tokens_seen": 136658930, + "step": 6353, + "time_per_iteration": 2.601808786392212 + }, + { + "auxiliary_loss_clip": 0.01156984, + "auxiliary_loss_mlp": 0.01023895, + "balance_loss_clip": 1.04849374, + "balance_loss_mlp": 1.01662576, + "epoch": 0.764023327120784, + "flos": 28878104609280.0, + "grad_norm": 1.8511146631553665, + "language_loss": 0.75109136, + "learning_rate": 5.560878120909511e-07, + "loss": 0.77290016, + "num_input_tokens_seen": 136681530, + "step": 6354, + "time_per_iteration": 2.5004525184631348 + }, + { + "auxiliary_loss_clip": 0.01059698, + "auxiliary_loss_mlp": 0.01002478, + "balance_loss_clip": 1.01318908, + "balance_loss_mlp": 1.0014708, + "epoch": 0.7641435700114231, + "flos": 64789711067520.0, + "grad_norm": 0.8441767138699058, + "language_loss": 0.58611226, + "learning_rate": 5.55548920864141e-07, + "loss": 0.60673404, + "num_input_tokens_seen": 136742185, + "step": 6355, + "time_per_iteration": 3.04837703704834 + }, + { + "auxiliary_loss_clip": 0.01155418, + "auxiliary_loss_mlp": 0.01020966, + "balance_loss_clip": 1.05129814, + "balance_loss_mlp": 1.01443887, + "epoch": 0.7642638129020621, + "flos": 16835784785280.0, + "grad_norm": 1.8017246285669568, + "language_loss": 0.77789485, + "learning_rate": 5.550102487594113e-07, + "loss": 0.79965872, + "num_input_tokens_seen": 136760855, + "step": 6356, + "time_per_iteration": 2.4332199096679688 + }, + { + "auxiliary_loss_clip": 0.01114587, + "auxiliary_loss_mlp": 0.00761259, + "balance_loss_clip": 1.04167366, + "balance_loss_mlp": 1.0003233, + "epoch": 0.7643840557927013, + "flos": 30408940391040.0, + "grad_norm": 1.8794261144617133, + "language_loss": 0.71704465, + "learning_rate": 5.54471795858477e-07, + "loss": 0.73580307, + "num_input_tokens_seen": 136780925, + "step": 6357, + "time_per_iteration": 2.609037160873413 + }, + { + "auxiliary_loss_clip": 0.01125005, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.04139495, + "balance_loss_mlp": 1.02074564, + "epoch": 0.7645042986833404, + "flos": 16983234115200.0, + "grad_norm": 1.9632775658355854, + "language_loss": 0.82754779, + "learning_rate": 5.539335622430235e-07, + "loss": 0.84907901, + "num_input_tokens_seen": 136799545, + "step": 6358, + "time_per_iteration": 2.483186960220337 + }, + { + "auxiliary_loss_clip": 0.01146309, + "auxiliary_loss_mlp": 0.01026911, + "balance_loss_clip": 1.04414415, + "balance_loss_mlp": 1.01919866, + "epoch": 0.7646245415739794, + "flos": 17311493531520.0, + "grad_norm": 2.0609308329142393, + "language_loss": 0.74821275, + "learning_rate": 5.533955479946975e-07, + "loss": 0.76994491, + "num_input_tokens_seen": 136818325, + "step": 6359, + "time_per_iteration": 2.4492714405059814 + }, + { + "auxiliary_loss_clip": 0.01034095, + "auxiliary_loss_mlp": 0.00753077, + "balance_loss_clip": 1.02201557, + "balance_loss_mlp": 0.99990511, + "epoch": 0.7647447844646186, + "flos": 70402332666240.0, + "grad_norm": 0.8732725267242816, + "language_loss": 0.65782553, + "learning_rate": 5.528577531951173e-07, + "loss": 0.67569721, + "num_input_tokens_seen": 136878730, + "step": 6360, + "time_per_iteration": 3.057543992996216 + }, + { + "auxiliary_loss_clip": 0.01144509, + "auxiliary_loss_mlp": 0.01022368, + "balance_loss_clip": 1.04717374, + "balance_loss_mlp": 1.01574552, + "epoch": 0.7648650273552576, + "flos": 17675914965120.0, + "grad_norm": 2.466310627002327, + "language_loss": 0.73924428, + "learning_rate": 5.523201779258653e-07, + "loss": 0.76091301, + "num_input_tokens_seen": 136897705, + "step": 6361, + "time_per_iteration": 2.45358943939209 + }, + { + "auxiliary_loss_clip": 0.01165745, + "auxiliary_loss_mlp": 0.01023562, + "balance_loss_clip": 1.04690981, + "balance_loss_mlp": 1.0161891, + "epoch": 0.7649852702458967, + "flos": 22162019247360.0, + "grad_norm": 1.8628793001611974, + "language_loss": 0.84109479, + "learning_rate": 5.517828222684912e-07, + "loss": 0.86298788, + "num_input_tokens_seen": 136918360, + "step": 6362, + "time_per_iteration": 2.4129037857055664 + }, + { + "auxiliary_loss_clip": 0.0104627, + "auxiliary_loss_mlp": 0.01001362, + "balance_loss_clip": 1.01423407, + "balance_loss_mlp": 1.0002774, + "epoch": 0.7651055131365359, + "flos": 69848338227840.0, + "grad_norm": 0.765834522646588, + "language_loss": 0.59067565, + "learning_rate": 5.512456863045117e-07, + "loss": 0.61115199, + "num_input_tokens_seen": 136979050, + "step": 6363, + "time_per_iteration": 3.871473789215088 + }, + { + "auxiliary_loss_clip": 0.01166817, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.04681683, + "balance_loss_mlp": 1.01928055, + "epoch": 0.7652257560271749, + "flos": 19464014931840.0, + "grad_norm": 2.3135936498083147, + "language_loss": 0.74049056, + "learning_rate": 5.507087701154089e-07, + "loss": 0.76242316, + "num_input_tokens_seen": 136998970, + "step": 6364, + "time_per_iteration": 3.1764132976531982 + }, + { + "auxiliary_loss_clip": 0.01112698, + "auxiliary_loss_mlp": 0.01026497, + "balance_loss_clip": 1.04280519, + "balance_loss_mlp": 1.01966918, + "epoch": 0.765345998917814, + "flos": 15961108700160.0, + "grad_norm": 1.9229534114550089, + "language_loss": 0.75533187, + "learning_rate": 5.50172073782634e-07, + "loss": 0.7767238, + "num_input_tokens_seen": 137016950, + "step": 6365, + "time_per_iteration": 3.3357675075531006 + }, + { + "auxiliary_loss_clip": 0.01123193, + "auxiliary_loss_mlp": 0.01027664, + "balance_loss_clip": 1.04585505, + "balance_loss_mlp": 1.02054691, + "epoch": 0.7654662418084531, + "flos": 23659853408640.0, + "grad_norm": 4.170047747101967, + "language_loss": 0.87785631, + "learning_rate": 5.496355973876023e-07, + "loss": 0.89936483, + "num_input_tokens_seen": 137036205, + "step": 6366, + "time_per_iteration": 2.5032167434692383 + }, + { + "auxiliary_loss_clip": 0.01121971, + "auxiliary_loss_mlp": 0.00762334, + "balance_loss_clip": 1.043329, + "balance_loss_mlp": 1.0003792, + "epoch": 0.7655864846990922, + "flos": 41463608878080.0, + "grad_norm": 21.05141699631102, + "language_loss": 0.70696187, + "learning_rate": 5.490993410116984e-07, + "loss": 0.72580487, + "num_input_tokens_seen": 137059195, + "step": 6367, + "time_per_iteration": 2.689823627471924 + }, + { + "auxiliary_loss_clip": 0.01123308, + "auxiliary_loss_mlp": 0.01027936, + "balance_loss_clip": 1.04692626, + "balance_loss_mlp": 1.02109945, + "epoch": 0.7657067275897312, + "flos": 43142684088960.0, + "grad_norm": 1.731143379743734, + "language_loss": 0.69647586, + "learning_rate": 5.485633047362704e-07, + "loss": 0.71798825, + "num_input_tokens_seen": 137081200, + "step": 6368, + "time_per_iteration": 2.697878837585449 + }, + { + "auxiliary_loss_clip": 0.01174076, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.05254412, + "balance_loss_mlp": 1.02226305, + "epoch": 0.7658269704803703, + "flos": 17311780840320.0, + "grad_norm": 5.347963610685303, + "language_loss": 0.78490973, + "learning_rate": 5.480274886426341e-07, + "loss": 0.80694783, + "num_input_tokens_seen": 137097840, + "step": 6369, + "time_per_iteration": 3.1596245765686035 + }, + { + "auxiliary_loss_clip": 0.01151102, + "auxiliary_loss_mlp": 0.01022818, + "balance_loss_clip": 1.04919469, + "balance_loss_mlp": 1.01610088, + "epoch": 0.7659472133710095, + "flos": 12568160977920.0, + "grad_norm": 1.9345798010231363, + "language_loss": 0.77938432, + "learning_rate": 5.474918928120744e-07, + "loss": 0.8011235, + "num_input_tokens_seen": 137114335, + "step": 6370, + "time_per_iteration": 2.4045112133026123 + }, + { + "auxiliary_loss_clip": 0.0115023, + "auxiliary_loss_mlp": 0.01021179, + "balance_loss_clip": 1.0463264, + "balance_loss_mlp": 1.01470876, + "epoch": 0.7660674562616485, + "flos": 22707430335360.0, + "grad_norm": 1.9443593274740107, + "language_loss": 0.87434733, + "learning_rate": 5.469565173258392e-07, + "loss": 0.89606148, + "num_input_tokens_seen": 137132850, + "step": 6371, + "time_per_iteration": 2.441770076751709 + }, + { + "auxiliary_loss_clip": 0.01170854, + "auxiliary_loss_mlp": 0.01028128, + "balance_loss_clip": 1.04856074, + "balance_loss_mlp": 1.02049899, + "epoch": 0.7661876991522876, + "flos": 17056455989760.0, + "grad_norm": 1.7071123412324896, + "language_loss": 0.63544959, + "learning_rate": 5.464213622651454e-07, + "loss": 0.65743947, + "num_input_tokens_seen": 137150665, + "step": 6372, + "time_per_iteration": 2.415595531463623 + }, + { + "auxiliary_loss_clip": 0.01131167, + "auxiliary_loss_mlp": 0.01026077, + "balance_loss_clip": 1.04567599, + "balance_loss_mlp": 1.01888835, + "epoch": 0.7663079420429267, + "flos": 20084228092800.0, + "grad_norm": 1.6241856462781512, + "language_loss": 0.84267974, + "learning_rate": 5.458864277111753e-07, + "loss": 0.86425221, + "num_input_tokens_seen": 137168500, + "step": 6373, + "time_per_iteration": 2.4927902221679688 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.0076095, + "balance_loss_clip": 1.04256582, + "balance_loss_mlp": 1.00027359, + "epoch": 0.7664281849335658, + "flos": 12677473042560.0, + "grad_norm": 2.3540503747733594, + "language_loss": 0.69292229, + "learning_rate": 5.453517137450769e-07, + "loss": 0.71184337, + "num_input_tokens_seen": 137185075, + "step": 6374, + "time_per_iteration": 2.4372057914733887 + }, + { + "auxiliary_loss_clip": 0.01152503, + "auxiliary_loss_mlp": 0.01025327, + "balance_loss_clip": 1.04839587, + "balance_loss_mlp": 1.01791263, + "epoch": 0.7665484278242048, + "flos": 22345271458560.0, + "grad_norm": 2.0147409502210394, + "language_loss": 0.75881517, + "learning_rate": 5.448172204479684e-07, + "loss": 0.78059345, + "num_input_tokens_seen": 137204355, + "step": 6375, + "time_per_iteration": 2.4697751998901367 + }, + { + "auxiliary_loss_clip": 0.01164884, + "auxiliary_loss_mlp": 0.01025059, + "balance_loss_clip": 1.04755449, + "balance_loss_mlp": 1.01797748, + "epoch": 0.766668670714844, + "flos": 23617909301760.0, + "grad_norm": 1.9338781180831894, + "language_loss": 0.7459054, + "learning_rate": 5.442829479009294e-07, + "loss": 0.76780474, + "num_input_tokens_seen": 137223135, + "step": 6376, + "time_per_iteration": 2.409158945083618 + }, + { + "auxiliary_loss_clip": 0.01159459, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.04668212, + "balance_loss_mlp": 1.01988459, + "epoch": 0.7667889136054831, + "flos": 19427134642560.0, + "grad_norm": 2.392654489155968, + "language_loss": 0.71623802, + "learning_rate": 5.437488961850103e-07, + "loss": 0.73810834, + "num_input_tokens_seen": 137242935, + "step": 6377, + "time_per_iteration": 2.4200100898742676 + }, + { + "auxiliary_loss_clip": 0.01106178, + "auxiliary_loss_mlp": 0.01024907, + "balance_loss_clip": 1.04220891, + "balance_loss_mlp": 1.01852059, + "epoch": 0.7669091564961221, + "flos": 26866352609280.0, + "grad_norm": 1.7484663900688924, + "language_loss": 0.75362182, + "learning_rate": 5.432150653812258e-07, + "loss": 0.77493268, + "num_input_tokens_seen": 137262970, + "step": 6378, + "time_per_iteration": 2.573922872543335 + }, + { + "auxiliary_loss_clip": 0.01150978, + "auxiliary_loss_mlp": 0.01025203, + "balance_loss_clip": 1.04763174, + "balance_loss_mlp": 1.01786304, + "epoch": 0.7670293993867613, + "flos": 12385303816320.0, + "grad_norm": 2.564257105109169, + "language_loss": 0.82675385, + "learning_rate": 5.42681455570557e-07, + "loss": 0.84851563, + "num_input_tokens_seen": 137279500, + "step": 6379, + "time_per_iteration": 2.4332940578460693 + }, + { + "auxiliary_loss_clip": 0.01163439, + "auxiliary_loss_mlp": 0.01022548, + "balance_loss_clip": 1.04626977, + "balance_loss_mlp": 1.01560998, + "epoch": 0.7671496422774003, + "flos": 21762944167680.0, + "grad_norm": 1.991398931282379, + "language_loss": 0.64638889, + "learning_rate": 5.42148066833954e-07, + "loss": 0.66824877, + "num_input_tokens_seen": 137298745, + "step": 6380, + "time_per_iteration": 2.394430637359619 + }, + { + "auxiliary_loss_clip": 0.01165104, + "auxiliary_loss_mlp": 0.01024203, + "balance_loss_clip": 1.04786932, + "balance_loss_mlp": 1.01716423, + "epoch": 0.7672698851680394, + "flos": 21069221823360.0, + "grad_norm": 2.2928881409693727, + "language_loss": 0.75220078, + "learning_rate": 5.416148992523289e-07, + "loss": 0.77409387, + "num_input_tokens_seen": 137317320, + "step": 6381, + "time_per_iteration": 2.3966023921966553 + }, + { + "auxiliary_loss_clip": 0.01081798, + "auxiliary_loss_mlp": 0.01025551, + "balance_loss_clip": 1.0407145, + "balance_loss_mlp": 1.01870561, + "epoch": 0.7673901280586786, + "flos": 16976697840000.0, + "grad_norm": 2.3913270229965367, + "language_loss": 0.784832, + "learning_rate": 5.410819529065644e-07, + "loss": 0.80590546, + "num_input_tokens_seen": 137335275, + "step": 6382, + "time_per_iteration": 2.564643144607544 + }, + { + "auxiliary_loss_clip": 0.01110222, + "auxiliary_loss_mlp": 0.01022767, + "balance_loss_clip": 1.04181111, + "balance_loss_mlp": 1.0159874, + "epoch": 0.7675103709493176, + "flos": 29242669697280.0, + "grad_norm": 2.0581881229278545, + "language_loss": 0.65522915, + "learning_rate": 5.405492278775079e-07, + "loss": 0.67655903, + "num_input_tokens_seen": 137355055, + "step": 6383, + "time_per_iteration": 2.597848415374756 + }, + { + "auxiliary_loss_clip": 0.01139524, + "auxiliary_loss_mlp": 0.01026378, + "balance_loss_clip": 1.04418361, + "balance_loss_mlp": 1.01903796, + "epoch": 0.7676306138399567, + "flos": 29023004073600.0, + "grad_norm": 2.4440017688204447, + "language_loss": 0.79515409, + "learning_rate": 5.400167242459732e-07, + "loss": 0.81681311, + "num_input_tokens_seen": 137374015, + "step": 6384, + "time_per_iteration": 2.5313050746917725 + }, + { + "auxiliary_loss_clip": 0.01150861, + "auxiliary_loss_mlp": 0.01027705, + "balance_loss_clip": 1.04624724, + "balance_loss_mlp": 1.02072835, + "epoch": 0.7677508567305958, + "flos": 22565116650240.0, + "grad_norm": 1.7362755035977755, + "language_loss": 0.80614436, + "learning_rate": 5.394844420927405e-07, + "loss": 0.82793003, + "num_input_tokens_seen": 137393625, + "step": 6385, + "time_per_iteration": 2.4353082180023193 + }, + { + "auxiliary_loss_clip": 0.01165397, + "auxiliary_loss_mlp": 0.01029275, + "balance_loss_clip": 1.04734337, + "balance_loss_mlp": 1.02205706, + "epoch": 0.7678710996212349, + "flos": 25411432222080.0, + "grad_norm": 2.024455177025597, + "language_loss": 0.73396903, + "learning_rate": 5.389523814985562e-07, + "loss": 0.75591576, + "num_input_tokens_seen": 137413045, + "step": 6386, + "time_per_iteration": 2.427029848098755 + }, + { + "auxiliary_loss_clip": 0.01111857, + "auxiliary_loss_mlp": 0.01023286, + "balance_loss_clip": 1.043805, + "balance_loss_mlp": 1.01593113, + "epoch": 0.767991342511874, + "flos": 26756825063040.0, + "grad_norm": 2.589964151125589, + "language_loss": 0.76044774, + "learning_rate": 5.384205425441344e-07, + "loss": 0.78179914, + "num_input_tokens_seen": 137433955, + "step": 6387, + "time_per_iteration": 2.579573631286621 + }, + { + "auxiliary_loss_clip": 0.01139821, + "auxiliary_loss_mlp": 0.01022253, + "balance_loss_clip": 1.04385543, + "balance_loss_mlp": 1.01521337, + "epoch": 0.7681115854025131, + "flos": 26359509749760.0, + "grad_norm": 1.8126868902802877, + "language_loss": 0.8405416, + "learning_rate": 5.378889253101537e-07, + "loss": 0.86216235, + "num_input_tokens_seen": 137454510, + "step": 6388, + "time_per_iteration": 2.5149872303009033 + }, + { + "auxiliary_loss_clip": 0.01153351, + "auxiliary_loss_mlp": 0.01022075, + "balance_loss_clip": 1.04650307, + "balance_loss_mlp": 1.01528931, + "epoch": 0.7682318282931522, + "flos": 23257043314560.0, + "grad_norm": 1.6333737764074625, + "language_loss": 0.81280822, + "learning_rate": 5.373575298772617e-07, + "loss": 0.83456242, + "num_input_tokens_seen": 137473630, + "step": 6389, + "time_per_iteration": 2.4467570781707764 + }, + { + "auxiliary_loss_clip": 0.01059106, + "auxiliary_loss_mlp": 0.01001136, + "balance_loss_clip": 1.01214981, + "balance_loss_mlp": 1.00000322, + "epoch": 0.7683520711837912, + "flos": 70072457137920.0, + "grad_norm": 0.7662733122752389, + "language_loss": 0.61310136, + "learning_rate": 5.368263563260689e-07, + "loss": 0.63370383, + "num_input_tokens_seen": 137538765, + "step": 6390, + "time_per_iteration": 3.9423651695251465 + }, + { + "auxiliary_loss_clip": 0.01153938, + "auxiliary_loss_mlp": 0.01022762, + "balance_loss_clip": 1.04655004, + "balance_loss_mlp": 1.01564562, + "epoch": 0.7684723140744304, + "flos": 18624890332800.0, + "grad_norm": 1.4744298952076518, + "language_loss": 0.64075053, + "learning_rate": 5.362954047371537e-07, + "loss": 0.66251755, + "num_input_tokens_seen": 137557875, + "step": 6391, + "time_per_iteration": 4.062402009963989 + }, + { + "auxiliary_loss_clip": 0.01128922, + "auxiliary_loss_mlp": 0.01027103, + "balance_loss_clip": 1.05053234, + "balance_loss_mlp": 1.01964617, + "epoch": 0.7685925569650695, + "flos": 27452989532160.0, + "grad_norm": 2.862261316174651, + "language_loss": 0.72164237, + "learning_rate": 5.357646751910627e-07, + "loss": 0.74320257, + "num_input_tokens_seen": 137579055, + "step": 6392, + "time_per_iteration": 2.5438971519470215 + }, + { + "auxiliary_loss_clip": 0.0113704, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.04458559, + "balance_loss_mlp": 1.02365828, + "epoch": 0.7687127998557085, + "flos": 24535714642560.0, + "grad_norm": 2.5933805690484544, + "language_loss": 0.79862297, + "learning_rate": 5.352341677683061e-07, + "loss": 0.8203041, + "num_input_tokens_seen": 137600355, + "step": 6393, + "time_per_iteration": 2.5085439682006836 + }, + { + "auxiliary_loss_clip": 0.01132052, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.04546452, + "balance_loss_mlp": 1.02042115, + "epoch": 0.7688330427463477, + "flos": 25155963717120.0, + "grad_norm": 1.8067173779569732, + "language_loss": 0.7894783, + "learning_rate": 5.347038825493617e-07, + "loss": 0.81107688, + "num_input_tokens_seen": 137621885, + "step": 6394, + "time_per_iteration": 2.5423941612243652 + }, + { + "auxiliary_loss_clip": 0.01136182, + "auxiliary_loss_mlp": 0.01024991, + "balance_loss_clip": 1.04805613, + "balance_loss_mlp": 1.01808882, + "epoch": 0.7689532856369867, + "flos": 21211284113280.0, + "grad_norm": 2.348601471180008, + "language_loss": 0.68609875, + "learning_rate": 5.341738196146732e-07, + "loss": 0.7077105, + "num_input_tokens_seen": 137640230, + "step": 6395, + "time_per_iteration": 2.462198257446289 + }, + { + "auxiliary_loss_clip": 0.01148984, + "auxiliary_loss_mlp": 0.01022085, + "balance_loss_clip": 1.04504156, + "balance_loss_mlp": 1.01493239, + "epoch": 0.7690735285276258, + "flos": 25119083427840.0, + "grad_norm": 2.3378419669585626, + "language_loss": 0.73600239, + "learning_rate": 5.336439790446503e-07, + "loss": 0.75771308, + "num_input_tokens_seen": 137659330, + "step": 6396, + "time_per_iteration": 3.214081048965454 + }, + { + "auxiliary_loss_clip": 0.01119193, + "auxiliary_loss_mlp": 0.01026499, + "balance_loss_clip": 1.04023659, + "balance_loss_mlp": 1.01849461, + "epoch": 0.769193771418265, + "flos": 54744020640000.0, + "grad_norm": 1.7038441912025504, + "language_loss": 0.62881178, + "learning_rate": 5.331143609196711e-07, + "loss": 0.65026867, + "num_input_tokens_seen": 137683145, + "step": 6397, + "time_per_iteration": 2.8088443279266357 + }, + { + "auxiliary_loss_clip": 0.01153538, + "auxiliary_loss_mlp": 0.01024576, + "balance_loss_clip": 1.04888165, + "balance_loss_mlp": 1.01738739, + "epoch": 0.769314014308904, + "flos": 37341890115840.0, + "grad_norm": 1.6713104968828993, + "language_loss": 0.76808703, + "learning_rate": 5.325849653200758e-07, + "loss": 0.78986812, + "num_input_tokens_seen": 137707095, + "step": 6398, + "time_per_iteration": 2.5601727962493896 + }, + { + "auxiliary_loss_clip": 0.0116715, + "auxiliary_loss_mlp": 0.01023884, + "balance_loss_clip": 1.0490346, + "balance_loss_mlp": 1.01664782, + "epoch": 0.7694342571995431, + "flos": 20631686256000.0, + "grad_norm": 1.6415384007702871, + "language_loss": 0.76343262, + "learning_rate": 5.32055792326175e-07, + "loss": 0.78534299, + "num_input_tokens_seen": 137725520, + "step": 6399, + "time_per_iteration": 2.4432456493377686 + }, + { + "auxiliary_loss_clip": 0.01141757, + "auxiliary_loss_mlp": 0.01021077, + "balance_loss_clip": 1.04749393, + "balance_loss_mlp": 1.01371002, + "epoch": 0.7695545000901821, + "flos": 24207706621440.0, + "grad_norm": 2.3435896626514996, + "language_loss": 0.73139322, + "learning_rate": 5.315268420182437e-07, + "loss": 0.75302148, + "num_input_tokens_seen": 137744195, + "step": 6400, + "time_per_iteration": 2.4782979488372803 + }, + { + "auxiliary_loss_clip": 0.01129143, + "auxiliary_loss_mlp": 0.00761597, + "balance_loss_clip": 1.04450679, + "balance_loss_mlp": 1.00037694, + "epoch": 0.7696747429808213, + "flos": 28001273708160.0, + "grad_norm": 1.7500450904575275, + "language_loss": 0.76619947, + "learning_rate": 5.309981144765221e-07, + "loss": 0.7851069, + "num_input_tokens_seen": 137764340, + "step": 6401, + "time_per_iteration": 2.6107680797576904 + }, + { + "auxiliary_loss_clip": 0.01115922, + "auxiliary_loss_mlp": 0.01020622, + "balance_loss_clip": 1.04253924, + "balance_loss_mlp": 1.01420283, + "epoch": 0.7697949858714603, + "flos": 11509550323200.0, + "grad_norm": 3.108236240830104, + "language_loss": 0.75308383, + "learning_rate": 5.304696097812196e-07, + "loss": 0.77444929, + "num_input_tokens_seen": 137780940, + "step": 6402, + "time_per_iteration": 2.5105855464935303 + }, + { + "auxiliary_loss_clip": 0.01135856, + "auxiliary_loss_mlp": 0.01029401, + "balance_loss_clip": 1.04330397, + "balance_loss_mlp": 1.02156341, + "epoch": 0.7699152287620994, + "flos": 26688271956480.0, + "grad_norm": 3.265144608985128, + "language_loss": 0.60605782, + "learning_rate": 5.299413280125078e-07, + "loss": 0.62771046, + "num_input_tokens_seen": 137799250, + "step": 6403, + "time_per_iteration": 2.5085442066192627 + }, + { + "auxiliary_loss_clip": 0.01139339, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.04530358, + "balance_loss_mlp": 1.02209449, + "epoch": 0.7700354716527386, + "flos": 16544944362240.0, + "grad_norm": 2.181327075446862, + "language_loss": 0.72531736, + "learning_rate": 5.294132692505284e-07, + "loss": 0.74700254, + "num_input_tokens_seen": 137817660, + "step": 6404, + "time_per_iteration": 2.466015338897705 + }, + { + "auxiliary_loss_clip": 0.01101995, + "auxiliary_loss_mlp": 0.01025704, + "balance_loss_clip": 1.04092407, + "balance_loss_mlp": 1.01808906, + "epoch": 0.7701557145433776, + "flos": 19242733196160.0, + "grad_norm": 1.8960630050847007, + "language_loss": 0.79147637, + "learning_rate": 5.288854335753861e-07, + "loss": 0.81275332, + "num_input_tokens_seen": 137835920, + "step": 6405, + "time_per_iteration": 2.5346932411193848 + }, + { + "auxiliary_loss_clip": 0.01152231, + "auxiliary_loss_mlp": 0.01020601, + "balance_loss_clip": 1.044837, + "balance_loss_mlp": 1.01351082, + "epoch": 0.7702759574340167, + "flos": 31685744211840.0, + "grad_norm": 2.386096919592745, + "language_loss": 0.75552845, + "learning_rate": 5.283578210671551e-07, + "loss": 0.77725673, + "num_input_tokens_seen": 137858160, + "step": 6406, + "time_per_iteration": 2.52358078956604 + }, + { + "auxiliary_loss_clip": 0.01141899, + "auxiliary_loss_mlp": 0.01021041, + "balance_loss_clip": 1.04589963, + "balance_loss_mlp": 1.01400757, + "epoch": 0.7703962003246558, + "flos": 16800089644800.0, + "grad_norm": 1.9971462237839832, + "language_loss": 0.76459974, + "learning_rate": 5.278304318058719e-07, + "loss": 0.78622913, + "num_input_tokens_seen": 137876015, + "step": 6407, + "time_per_iteration": 2.442455768585205 + }, + { + "auxiliary_loss_clip": 0.01097723, + "auxiliary_loss_mlp": 0.01027533, + "balance_loss_clip": 1.04268956, + "balance_loss_mlp": 1.01976061, + "epoch": 0.7705164432152949, + "flos": 35736072693120.0, + "grad_norm": 1.9523814734497202, + "language_loss": 0.79077709, + "learning_rate": 5.273032658715411e-07, + "loss": 0.81202972, + "num_input_tokens_seen": 137898825, + "step": 6408, + "time_per_iteration": 2.6898953914642334 + }, + { + "auxiliary_loss_clip": 0.01107563, + "auxiliary_loss_mlp": 0.01023865, + "balance_loss_clip": 1.04177475, + "balance_loss_mlp": 1.01662326, + "epoch": 0.7706366861059339, + "flos": 23365960329600.0, + "grad_norm": 2.030378492407595, + "language_loss": 0.76631641, + "learning_rate": 5.267763233441347e-07, + "loss": 0.7876308, + "num_input_tokens_seen": 137919455, + "step": 6409, + "time_per_iteration": 2.551865339279175 + }, + { + "auxiliary_loss_clip": 0.0115581, + "auxiliary_loss_mlp": 0.01022356, + "balance_loss_clip": 1.04747939, + "balance_loss_mlp": 1.01460743, + "epoch": 0.7707569289965731, + "flos": 22929897219840.0, + "grad_norm": 3.1573725336824485, + "language_loss": 0.6991812, + "learning_rate": 5.26249604303588e-07, + "loss": 0.72096288, + "num_input_tokens_seen": 137937960, + "step": 6410, + "time_per_iteration": 2.443208694458008 + }, + { + "auxiliary_loss_clip": 0.01166478, + "auxiliary_loss_mlp": 0.01025082, + "balance_loss_clip": 1.04868722, + "balance_loss_mlp": 1.01802516, + "epoch": 0.7708771718872122, + "flos": 17420661941760.0, + "grad_norm": 2.1389885977442047, + "language_loss": 0.78599912, + "learning_rate": 5.257231088298057e-07, + "loss": 0.80791473, + "num_input_tokens_seen": 137956370, + "step": 6411, + "time_per_iteration": 2.3761322498321533 + }, + { + "auxiliary_loss_clip": 0.01032962, + "auxiliary_loss_mlp": 0.01001088, + "balance_loss_clip": 1.01138186, + "balance_loss_mlp": 0.99999762, + "epoch": 0.7709974147778512, + "flos": 72241316248320.0, + "grad_norm": 0.7985975284489502, + "language_loss": 0.53935266, + "learning_rate": 5.25196837002655e-07, + "loss": 0.55969322, + "num_input_tokens_seen": 138016080, + "step": 6412, + "time_per_iteration": 3.1264119148254395 + }, + { + "auxiliary_loss_clip": 0.01136337, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.04465556, + "balance_loss_mlp": 1.0263474, + "epoch": 0.7711176576684904, + "flos": 39859694876160.0, + "grad_norm": 2.687812407898376, + "language_loss": 0.68508303, + "learning_rate": 5.24670788901971e-07, + "loss": 0.70678473, + "num_input_tokens_seen": 138039170, + "step": 6413, + "time_per_iteration": 2.6299617290496826 + }, + { + "auxiliary_loss_clip": 0.01138367, + "auxiliary_loss_mlp": 0.01026295, + "balance_loss_clip": 1.04648781, + "balance_loss_mlp": 1.01796854, + "epoch": 0.7712379005591294, + "flos": 36976391274240.0, + "grad_norm": 2.625574801016011, + "language_loss": 0.68696755, + "learning_rate": 5.241449646075557e-07, + "loss": 0.70861423, + "num_input_tokens_seen": 138062395, + "step": 6414, + "time_per_iteration": 2.5986409187316895 + }, + { + "auxiliary_loss_clip": 0.01161972, + "auxiliary_loss_mlp": 0.01026753, + "balance_loss_clip": 1.0483228, + "balance_loss_mlp": 1.0195111, + "epoch": 0.7713581434497685, + "flos": 22776773541120.0, + "grad_norm": 2.105796709133199, + "language_loss": 0.72661221, + "learning_rate": 5.236193641991762e-07, + "loss": 0.74849951, + "num_input_tokens_seen": 138080325, + "step": 6415, + "time_per_iteration": 2.4423716068267822 + }, + { + "auxiliary_loss_clip": 0.01137557, + "auxiliary_loss_mlp": 0.01024376, + "balance_loss_clip": 1.04565716, + "balance_loss_mlp": 1.01741719, + "epoch": 0.7714783863404077, + "flos": 24097460803200.0, + "grad_norm": 2.363846756376933, + "language_loss": 0.70127535, + "learning_rate": 5.23093987756565e-07, + "loss": 0.72289467, + "num_input_tokens_seen": 138099020, + "step": 6416, + "time_per_iteration": 3.344449281692505 + }, + { + "auxiliary_loss_clip": 0.01128316, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.04139125, + "balance_loss_mlp": 1.019104, + "epoch": 0.7715986292310467, + "flos": 21063655215360.0, + "grad_norm": 1.8919636641419346, + "language_loss": 0.75455183, + "learning_rate": 5.225688353594217e-07, + "loss": 0.77610105, + "num_input_tokens_seen": 138118650, + "step": 6417, + "time_per_iteration": 3.261704683303833 + }, + { + "auxiliary_loss_clip": 0.01143051, + "auxiliary_loss_mlp": 0.0076162, + "balance_loss_clip": 1.04709172, + "balance_loss_mlp": 1.0003562, + "epoch": 0.7717188721216858, + "flos": 20594877793920.0, + "grad_norm": 2.5151830443973946, + "language_loss": 0.77783799, + "learning_rate": 5.220439070874108e-07, + "loss": 0.79688466, + "num_input_tokens_seen": 138137890, + "step": 6418, + "time_per_iteration": 3.306267738342285 + }, + { + "auxiliary_loss_clip": 0.01153546, + "auxiliary_loss_mlp": 0.01026658, + "balance_loss_clip": 1.04918706, + "balance_loss_mlp": 1.01971447, + "epoch": 0.7718391150123249, + "flos": 26250951870720.0, + "grad_norm": 1.8273201541943596, + "language_loss": 0.71487725, + "learning_rate": 5.215192030201652e-07, + "loss": 0.73667926, + "num_input_tokens_seen": 138158880, + "step": 6419, + "time_per_iteration": 2.4934487342834473 + }, + { + "auxiliary_loss_clip": 0.01110969, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.03986096, + "balance_loss_mlp": 1.01911354, + "epoch": 0.771959357902964, + "flos": 22049762267520.0, + "grad_norm": 2.124705495135433, + "language_loss": 0.86177957, + "learning_rate": 5.209947232372798e-07, + "loss": 0.88315308, + "num_input_tokens_seen": 138176370, + "step": 6420, + "time_per_iteration": 2.4981281757354736 + }, + { + "auxiliary_loss_clip": 0.01154832, + "auxiliary_loss_mlp": 0.00761693, + "balance_loss_clip": 1.04481959, + "balance_loss_mlp": 1.00033069, + "epoch": 0.772079600793603, + "flos": 30446000248320.0, + "grad_norm": 1.711911543446483, + "language_loss": 0.81308931, + "learning_rate": 5.204704678183196e-07, + "loss": 0.83225459, + "num_input_tokens_seen": 138195105, + "step": 6421, + "time_per_iteration": 2.511103391647339 + }, + { + "auxiliary_loss_clip": 0.01167727, + "auxiliary_loss_mlp": 0.01023517, + "balance_loss_clip": 1.04906154, + "balance_loss_mlp": 1.01585793, + "epoch": 0.7721998436842422, + "flos": 12969857750400.0, + "grad_norm": 1.8407963179988214, + "language_loss": 0.85071194, + "learning_rate": 5.19946436842813e-07, + "loss": 0.87262434, + "num_input_tokens_seen": 138212235, + "step": 6422, + "time_per_iteration": 3.145216703414917 + }, + { + "auxiliary_loss_clip": 0.01129394, + "auxiliary_loss_mlp": 0.01022093, + "balance_loss_clip": 1.04907441, + "balance_loss_mlp": 1.01520836, + "epoch": 0.7723200865748813, + "flos": 32635509678720.0, + "grad_norm": 1.7149964127767598, + "language_loss": 0.68343997, + "learning_rate": 5.194226303902546e-07, + "loss": 0.70495486, + "num_input_tokens_seen": 138231970, + "step": 6423, + "time_per_iteration": 2.592524528503418 + }, + { + "auxiliary_loss_clip": 0.01136572, + "auxiliary_loss_mlp": 0.0102629, + "balance_loss_clip": 1.04452682, + "balance_loss_mlp": 1.01940012, + "epoch": 0.7724403294655203, + "flos": 21105707063040.0, + "grad_norm": 1.8414754019365667, + "language_loss": 0.71045929, + "learning_rate": 5.188990485401072e-07, + "loss": 0.73208791, + "num_input_tokens_seen": 138251175, + "step": 6424, + "time_per_iteration": 2.4589998722076416 + }, + { + "auxiliary_loss_clip": 0.01153004, + "auxiliary_loss_mlp": 0.01020877, + "balance_loss_clip": 1.04669023, + "balance_loss_mlp": 1.01413012, + "epoch": 0.7725605723561595, + "flos": 22090736707200.0, + "grad_norm": 1.873710553877904, + "language_loss": 0.85851347, + "learning_rate": 5.183756913717954e-07, + "loss": 0.88025224, + "num_input_tokens_seen": 138270950, + "step": 6425, + "time_per_iteration": 2.457789897918701 + }, + { + "auxiliary_loss_clip": 0.01134962, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.04533386, + "balance_loss_mlp": 1.02292693, + "epoch": 0.7726808152467985, + "flos": 34495610457600.0, + "grad_norm": 2.513988629670602, + "language_loss": 0.73195076, + "learning_rate": 5.178525589647136e-07, + "loss": 0.75359988, + "num_input_tokens_seen": 138292590, + "step": 6426, + "time_per_iteration": 2.5548133850097656 + }, + { + "auxiliary_loss_clip": 0.01142894, + "auxiliary_loss_mlp": 0.01023533, + "balance_loss_clip": 1.04459167, + "balance_loss_mlp": 1.01694965, + "epoch": 0.7728010581374376, + "flos": 22306344094080.0, + "grad_norm": 1.7305743299857412, + "language_loss": 0.78915191, + "learning_rate": 5.173296513982197e-07, + "loss": 0.81081617, + "num_input_tokens_seen": 138311115, + "step": 6427, + "time_per_iteration": 2.4777238368988037 + }, + { + "auxiliary_loss_clip": 0.01134783, + "auxiliary_loss_mlp": 0.01027905, + "balance_loss_clip": 1.04711282, + "balance_loss_mlp": 1.01950645, + "epoch": 0.7729213010280768, + "flos": 27126453968640.0, + "grad_norm": 1.9938340598284112, + "language_loss": 0.64725649, + "learning_rate": 5.168069687516398e-07, + "loss": 0.66888338, + "num_input_tokens_seen": 138330885, + "step": 6428, + "time_per_iteration": 2.551393985748291 + }, + { + "auxiliary_loss_clip": 0.01140267, + "auxiliary_loss_mlp": 0.01021382, + "balance_loss_clip": 1.04872108, + "balance_loss_mlp": 1.01407135, + "epoch": 0.7730415439187158, + "flos": 18150223080960.0, + "grad_norm": 1.8176777061761609, + "language_loss": 0.71884924, + "learning_rate": 5.16284511104263e-07, + "loss": 0.7404657, + "num_input_tokens_seen": 138350020, + "step": 6429, + "time_per_iteration": 2.4699227809906006 + }, + { + "auxiliary_loss_clip": 0.01137933, + "auxiliary_loss_mlp": 0.01025744, + "balance_loss_clip": 1.04636443, + "balance_loss_mlp": 1.01819777, + "epoch": 0.7731617868093549, + "flos": 11947480940160.0, + "grad_norm": 2.564977626660824, + "language_loss": 0.80906701, + "learning_rate": 5.157622785353457e-07, + "loss": 0.83070374, + "num_input_tokens_seen": 138368135, + "step": 6430, + "time_per_iteration": 2.431574821472168 + }, + { + "auxiliary_loss_clip": 0.01057661, + "auxiliary_loss_mlp": 0.0100141, + "balance_loss_clip": 1.01181853, + "balance_loss_mlp": 1.00029504, + "epoch": 0.7732820296999939, + "flos": 64201027069440.0, + "grad_norm": 0.6419605017811381, + "language_loss": 0.60350251, + "learning_rate": 5.152402711241113e-07, + "loss": 0.62409329, + "num_input_tokens_seen": 138436040, + "step": 6431, + "time_per_iteration": 3.090681791305542 + }, + { + "auxiliary_loss_clip": 0.01118897, + "auxiliary_loss_mlp": 0.01021826, + "balance_loss_clip": 1.03975129, + "balance_loss_mlp": 1.01476574, + "epoch": 0.7734022725906331, + "flos": 25302191984640.0, + "grad_norm": 1.810689450895245, + "language_loss": 0.8296504, + "learning_rate": 5.147184889497465e-07, + "loss": 0.85105759, + "num_input_tokens_seen": 138455510, + "step": 6432, + "time_per_iteration": 2.5600802898406982 + }, + { + "auxiliary_loss_clip": 0.01116893, + "auxiliary_loss_mlp": 0.01024007, + "balance_loss_clip": 1.04305601, + "balance_loss_mlp": 1.01667523, + "epoch": 0.7735225154812722, + "flos": 17347440067200.0, + "grad_norm": 2.6104377427408134, + "language_loss": 0.8023839, + "learning_rate": 5.141969320914072e-07, + "loss": 0.82379293, + "num_input_tokens_seen": 138473015, + "step": 6433, + "time_per_iteration": 2.4652419090270996 + }, + { + "auxiliary_loss_clip": 0.01169906, + "auxiliary_loss_mlp": 0.0102564, + "balance_loss_clip": 1.04759467, + "balance_loss_mlp": 1.01796913, + "epoch": 0.7736427583719112, + "flos": 32630086725120.0, + "grad_norm": 2.1434589849239587, + "language_loss": 0.62281764, + "learning_rate": 5.136756006282113e-07, + "loss": 0.64477313, + "num_input_tokens_seen": 138491680, + "step": 6434, + "time_per_iteration": 2.4845428466796875 + }, + { + "auxiliary_loss_clip": 0.01168518, + "auxiliary_loss_mlp": 0.01025235, + "balance_loss_clip": 1.04943657, + "balance_loss_mlp": 1.0178616, + "epoch": 0.7737630012625504, + "flos": 19860073269120.0, + "grad_norm": 2.137264941480943, + "language_loss": 0.85020149, + "learning_rate": 5.131544946392446e-07, + "loss": 0.87213904, + "num_input_tokens_seen": 138506960, + "step": 6435, + "time_per_iteration": 2.3737845420837402 + }, + { + "auxiliary_loss_clip": 0.01143483, + "auxiliary_loss_mlp": 0.01025696, + "balance_loss_clip": 1.05161285, + "balance_loss_mlp": 1.01839757, + "epoch": 0.7738832441531894, + "flos": 36022639397760.0, + "grad_norm": 2.589393978676421, + "language_loss": 0.6365037, + "learning_rate": 5.126336142035592e-07, + "loss": 0.6581955, + "num_input_tokens_seen": 138526995, + "step": 6436, + "time_per_iteration": 2.6106059551239014 + }, + { + "auxiliary_loss_clip": 0.01138641, + "auxiliary_loss_mlp": 0.01025322, + "balance_loss_clip": 1.04440117, + "balance_loss_mlp": 1.01779437, + "epoch": 0.7740034870438285, + "flos": 13405274415360.0, + "grad_norm": 2.814142197144034, + "language_loss": 0.72009242, + "learning_rate": 5.121129594001721e-07, + "loss": 0.74173212, + "num_input_tokens_seen": 138541260, + "step": 6437, + "time_per_iteration": 2.4573123455047607 + }, + { + "auxiliary_loss_clip": 0.01153181, + "auxiliary_loss_mlp": 0.01025154, + "balance_loss_clip": 1.04819942, + "balance_loss_mlp": 1.01763141, + "epoch": 0.7741237299344677, + "flos": 22086714384000.0, + "grad_norm": 1.5735089788083296, + "language_loss": 0.80978227, + "learning_rate": 5.115925303080661e-07, + "loss": 0.83156556, + "num_input_tokens_seen": 138560970, + "step": 6438, + "time_per_iteration": 2.476163387298584 + }, + { + "auxiliary_loss_clip": 0.01141201, + "auxiliary_loss_mlp": 0.01025642, + "balance_loss_clip": 1.0459826, + "balance_loss_mlp": 1.01894593, + "epoch": 0.7742439728251067, + "flos": 19864777950720.0, + "grad_norm": 2.1021872459072335, + "language_loss": 0.79231328, + "learning_rate": 5.110723270061899e-07, + "loss": 0.81398171, + "num_input_tokens_seen": 138577460, + "step": 6439, + "time_per_iteration": 2.441087245941162 + }, + { + "auxiliary_loss_clip": 0.01163282, + "auxiliary_loss_mlp": 0.01023818, + "balance_loss_clip": 1.04651213, + "balance_loss_mlp": 1.01740479, + "epoch": 0.7743642157157458, + "flos": 16690167048960.0, + "grad_norm": 2.005066207805757, + "language_loss": 0.79437143, + "learning_rate": 5.105523495734572e-07, + "loss": 0.81624246, + "num_input_tokens_seen": 138594860, + "step": 6440, + "time_per_iteration": 2.3974616527557373 + }, + { + "auxiliary_loss_clip": 0.0116725, + "auxiliary_loss_mlp": 0.01028219, + "balance_loss_clip": 1.04725432, + "balance_loss_mlp": 1.02051222, + "epoch": 0.7744844586063849, + "flos": 20304360593280.0, + "grad_norm": 1.521867159672832, + "language_loss": 0.7495482, + "learning_rate": 5.100325980887499e-07, + "loss": 0.77150285, + "num_input_tokens_seen": 138614785, + "step": 6441, + "time_per_iteration": 2.4130403995513916 + }, + { + "auxiliary_loss_clip": 0.01148242, + "auxiliary_loss_mlp": 0.01023467, + "balance_loss_clip": 1.04713202, + "balance_loss_mlp": 1.01623058, + "epoch": 0.774604701497024, + "flos": 22966705681920.0, + "grad_norm": 1.7288709964931221, + "language_loss": 0.83163971, + "learning_rate": 5.095130726309116e-07, + "loss": 0.85335678, + "num_input_tokens_seen": 138634960, + "step": 6442, + "time_per_iteration": 2.4730496406555176 + }, + { + "auxiliary_loss_clip": 0.0106572, + "auxiliary_loss_mlp": 0.01000898, + "balance_loss_clip": 1.01094031, + "balance_loss_mlp": 0.99984854, + "epoch": 0.774724944387663, + "flos": 60288523073280.0, + "grad_norm": 0.8013333376209394, + "language_loss": 0.59004474, + "learning_rate": 5.089937732787559e-07, + "loss": 0.61071086, + "num_input_tokens_seen": 138699520, + "step": 6443, + "time_per_iteration": 3.8609416484832764 + }, + { + "auxiliary_loss_clip": 0.01125663, + "auxiliary_loss_mlp": 0.0102824, + "balance_loss_clip": 1.0437777, + "balance_loss_mlp": 1.02057743, + "epoch": 0.7748451872783022, + "flos": 26761026954240.0, + "grad_norm": 2.680038287115145, + "language_loss": 0.66918653, + "learning_rate": 5.084747001110592e-07, + "loss": 0.69072556, + "num_input_tokens_seen": 138719145, + "step": 6444, + "time_per_iteration": 3.225616455078125 + }, + { + "auxiliary_loss_clip": 0.01152046, + "auxiliary_loss_mlp": 0.00761735, + "balance_loss_clip": 1.05140138, + "balance_loss_mlp": 1.00038409, + "epoch": 0.7749654301689413, + "flos": 30338627518080.0, + "grad_norm": 1.7290778018416668, + "language_loss": 0.70300925, + "learning_rate": 5.07955853206564e-07, + "loss": 0.72214717, + "num_input_tokens_seen": 138743850, + "step": 6445, + "time_per_iteration": 3.3695642948150635 + }, + { + "auxiliary_loss_clip": 0.01156388, + "auxiliary_loss_mlp": 0.01024178, + "balance_loss_clip": 1.04747105, + "balance_loss_mlp": 1.0171237, + "epoch": 0.7750856730595803, + "flos": 43179851687040.0, + "grad_norm": 1.5053421106051914, + "language_loss": 0.70791072, + "learning_rate": 5.074372326439807e-07, + "loss": 0.72971636, + "num_input_tokens_seen": 138766860, + "step": 6446, + "time_per_iteration": 2.6301259994506836 + }, + { + "auxiliary_loss_clip": 0.01126631, + "auxiliary_loss_mlp": 0.01024101, + "balance_loss_clip": 1.04497623, + "balance_loss_mlp": 1.01697826, + "epoch": 0.7752059159502195, + "flos": 17640040256640.0, + "grad_norm": 2.160646410047585, + "language_loss": 0.73268616, + "learning_rate": 5.069188385019814e-07, + "loss": 0.75419348, + "num_input_tokens_seen": 138784560, + "step": 6447, + "time_per_iteration": 2.468282461166382 + }, + { + "auxiliary_loss_clip": 0.01116939, + "auxiliary_loss_mlp": 0.01023589, + "balance_loss_clip": 1.04172969, + "balance_loss_mlp": 1.01630569, + "epoch": 0.7753261588408585, + "flos": 12677688524160.0, + "grad_norm": 2.5037220948534036, + "language_loss": 0.61468381, + "learning_rate": 5.064006708592077e-07, + "loss": 0.63608903, + "num_input_tokens_seen": 138800805, + "step": 6448, + "time_per_iteration": 2.5272462368011475 + }, + { + "auxiliary_loss_clip": 0.01132719, + "auxiliary_loss_mlp": 0.01021799, + "balance_loss_clip": 1.04613602, + "balance_loss_mlp": 1.01481926, + "epoch": 0.7754464017314976, + "flos": 16690741666560.0, + "grad_norm": 3.541752868752098, + "language_loss": 0.75716442, + "learning_rate": 5.058827297942641e-07, + "loss": 0.77870965, + "num_input_tokens_seen": 138815910, + "step": 6449, + "time_per_iteration": 3.141327142715454 + }, + { + "auxiliary_loss_clip": 0.01145791, + "auxiliary_loss_mlp": 0.0102569, + "balance_loss_clip": 1.04628992, + "balance_loss_mlp": 1.01848936, + "epoch": 0.7755666446221368, + "flos": 19718944732800.0, + "grad_norm": 1.7544060391221827, + "language_loss": 0.75124407, + "learning_rate": 5.053650153857237e-07, + "loss": 0.77295887, + "num_input_tokens_seen": 138834920, + "step": 6450, + "time_per_iteration": 2.501918315887451 + }, + { + "auxiliary_loss_clip": 0.01152202, + "auxiliary_loss_mlp": 0.01027634, + "balance_loss_clip": 1.04784274, + "balance_loss_mlp": 1.02066088, + "epoch": 0.7756868875127758, + "flos": 18693623007360.0, + "grad_norm": 1.6570058684961895, + "language_loss": 0.70000458, + "learning_rate": 5.048475277121214e-07, + "loss": 0.72180295, + "num_input_tokens_seen": 138852135, + "step": 6451, + "time_per_iteration": 2.40974497795105 + }, + { + "auxiliary_loss_clip": 0.01151228, + "auxiliary_loss_mlp": 0.01022735, + "balance_loss_clip": 1.04468381, + "balance_loss_mlp": 1.01558852, + "epoch": 0.7758071304034149, + "flos": 28404191543040.0, + "grad_norm": 1.6769634699945188, + "language_loss": 0.76976824, + "learning_rate": 5.043302668519598e-07, + "loss": 0.79150784, + "num_input_tokens_seen": 138871470, + "step": 6452, + "time_per_iteration": 2.4969372749328613 + }, + { + "auxiliary_loss_clip": 0.01155396, + "auxiliary_loss_mlp": 0.01020389, + "balance_loss_clip": 1.04569912, + "balance_loss_mlp": 1.01350188, + "epoch": 0.775927373294054, + "flos": 20595344670720.0, + "grad_norm": 2.080420852405165, + "language_loss": 0.71731466, + "learning_rate": 5.038132328837079e-07, + "loss": 0.7390725, + "num_input_tokens_seen": 138889860, + "step": 6453, + "time_per_iteration": 2.426710367202759 + }, + { + "auxiliary_loss_clip": 0.01152916, + "auxiliary_loss_mlp": 0.01020059, + "balance_loss_clip": 1.04607224, + "balance_loss_mlp": 1.01286435, + "epoch": 0.7760476161846931, + "flos": 22526368853760.0, + "grad_norm": 2.033429583875141, + "language_loss": 0.74034488, + "learning_rate": 5.032964258857993e-07, + "loss": 0.76207459, + "num_input_tokens_seen": 138909955, + "step": 6454, + "time_per_iteration": 2.439711809158325 + }, + { + "auxiliary_loss_clip": 0.01149091, + "auxiliary_loss_mlp": 0.01024304, + "balance_loss_clip": 1.04198992, + "balance_loss_mlp": 1.01694012, + "epoch": 0.7761678590753321, + "flos": 48651488403840.0, + "grad_norm": 1.5741380698309486, + "language_loss": 0.68368697, + "learning_rate": 5.027798459366329e-07, + "loss": 0.70542085, + "num_input_tokens_seen": 138935320, + "step": 6455, + "time_per_iteration": 2.666825294494629 + }, + { + "auxiliary_loss_clip": 0.01157841, + "auxiliary_loss_mlp": 0.01026364, + "balance_loss_clip": 1.04690766, + "balance_loss_mlp": 1.01894331, + "epoch": 0.7762881019659713, + "flos": 26177047637760.0, + "grad_norm": 1.3340643596533168, + "language_loss": 0.63644969, + "learning_rate": 5.02263493114573e-07, + "loss": 0.6582917, + "num_input_tokens_seen": 138957115, + "step": 6456, + "time_per_iteration": 2.4717013835906982 + }, + { + "auxiliary_loss_clip": 0.01163778, + "auxiliary_loss_mlp": 0.01025042, + "balance_loss_clip": 1.04592049, + "balance_loss_mlp": 1.0176245, + "epoch": 0.7764083448566104, + "flos": 20588341518720.0, + "grad_norm": 5.4999977880007185, + "language_loss": 0.7733652, + "learning_rate": 5.017473674979502e-07, + "loss": 0.7952534, + "num_input_tokens_seen": 138973140, + "step": 6457, + "time_per_iteration": 2.366192579269409 + }, + { + "auxiliary_loss_clip": 0.010271, + "auxiliary_loss_mlp": 0.01002277, + "balance_loss_clip": 1.0127337, + "balance_loss_mlp": 1.00142431, + "epoch": 0.7765285877472494, + "flos": 67293078560640.0, + "grad_norm": 0.7388999738542961, + "language_loss": 0.58362532, + "learning_rate": 5.01231469165061e-07, + "loss": 0.60391903, + "num_input_tokens_seen": 139028965, + "step": 6458, + "time_per_iteration": 2.9756686687469482 + }, + { + "auxiliary_loss_clip": 0.01056278, + "auxiliary_loss_mlp": 0.01001753, + "balance_loss_clip": 1.01111376, + "balance_loss_mlp": 1.00070441, + "epoch": 0.7766488306378886, + "flos": 61344476121600.0, + "grad_norm": 0.8341655305906173, + "language_loss": 0.56878179, + "learning_rate": 5.007157981941663e-07, + "loss": 0.58936214, + "num_input_tokens_seen": 139094325, + "step": 6459, + "time_per_iteration": 3.1317391395568848 + }, + { + "auxiliary_loss_clip": 0.0104721, + "auxiliary_loss_mlp": 0.01001525, + "balance_loss_clip": 1.01143205, + "balance_loss_mlp": 1.00048208, + "epoch": 0.7767690735285276, + "flos": 62946199393920.0, + "grad_norm": 0.8776113354078642, + "language_loss": 0.67385995, + "learning_rate": 5.002003546634928e-07, + "loss": 0.69434738, + "num_input_tokens_seen": 139150425, + "step": 6460, + "time_per_iteration": 3.0022177696228027 + }, + { + "auxiliary_loss_clip": 0.01111732, + "auxiliary_loss_mlp": 0.01024074, + "balance_loss_clip": 1.04784656, + "balance_loss_mlp": 1.01729679, + "epoch": 0.7768893164191667, + "flos": 20886400575360.0, + "grad_norm": 1.9647584450303748, + "language_loss": 0.75964898, + "learning_rate": 4.996851386512331e-07, + "loss": 0.78100705, + "num_input_tokens_seen": 139169130, + "step": 6461, + "time_per_iteration": 2.529478073120117 + }, + { + "auxiliary_loss_clip": 0.01138187, + "auxiliary_loss_mlp": 0.01026045, + "balance_loss_clip": 1.04549611, + "balance_loss_mlp": 1.0183115, + "epoch": 0.7770095593098058, + "flos": 20704584908160.0, + "grad_norm": 1.7860559881382527, + "language_loss": 0.83080018, + "learning_rate": 4.991701502355444e-07, + "loss": 0.85244256, + "num_input_tokens_seen": 139189595, + "step": 6462, + "time_per_iteration": 2.465259552001953 + }, + { + "auxiliary_loss_clip": 0.01155056, + "auxiliary_loss_mlp": 0.01023779, + "balance_loss_clip": 1.04640627, + "balance_loss_mlp": 1.01743054, + "epoch": 0.7771298022004449, + "flos": 24717709877760.0, + "grad_norm": 1.5090409968500693, + "language_loss": 0.7594372, + "learning_rate": 4.986553894945518e-07, + "loss": 0.78122556, + "num_input_tokens_seen": 139210805, + "step": 6463, + "time_per_iteration": 2.483222723007202 + }, + { + "auxiliary_loss_clip": 0.01111352, + "auxiliary_loss_mlp": 0.01023983, + "balance_loss_clip": 1.0402081, + "balance_loss_mlp": 1.01786137, + "epoch": 0.777250045091084, + "flos": 25009232659200.0, + "grad_norm": 2.1081996816741966, + "language_loss": 0.86050177, + "learning_rate": 4.981408565063416e-07, + "loss": 0.88185507, + "num_input_tokens_seen": 139230750, + "step": 6464, + "time_per_iteration": 2.5557990074157715 + }, + { + "auxiliary_loss_clip": 0.01167666, + "auxiliary_loss_mlp": 0.01022888, + "balance_loss_clip": 1.04819059, + "balance_loss_mlp": 1.01584315, + "epoch": 0.777370287981723, + "flos": 20119887319680.0, + "grad_norm": 2.0370118934167842, + "language_loss": 0.7603848, + "learning_rate": 4.976265513489701e-07, + "loss": 0.78229034, + "num_input_tokens_seen": 139250720, + "step": 6465, + "time_per_iteration": 2.4163033962249756 + }, + { + "auxiliary_loss_clip": 0.01150861, + "auxiliary_loss_mlp": 0.01024172, + "balance_loss_clip": 1.04393113, + "balance_loss_mlp": 1.01696539, + "epoch": 0.7774905308723622, + "flos": 21718809331200.0, + "grad_norm": 1.8639734558211378, + "language_loss": 0.80701333, + "learning_rate": 4.971124741004562e-07, + "loss": 0.82876366, + "num_input_tokens_seen": 139269720, + "step": 6466, + "time_per_iteration": 2.438819169998169 + }, + { + "auxiliary_loss_clip": 0.01150274, + "auxiliary_loss_mlp": 0.01021797, + "balance_loss_clip": 1.04517806, + "balance_loss_mlp": 1.01493669, + "epoch": 0.7776107737630013, + "flos": 16034115093120.0, + "grad_norm": 2.0543492683344833, + "language_loss": 0.76488411, + "learning_rate": 4.965986248387846e-07, + "loss": 0.78660482, + "num_input_tokens_seen": 139288035, + "step": 6467, + "time_per_iteration": 2.401381015777588 + }, + { + "auxiliary_loss_clip": 0.01140179, + "auxiliary_loss_mlp": 0.0102372, + "balance_loss_clip": 1.04370236, + "balance_loss_mlp": 1.01692843, + "epoch": 0.7777310166536403, + "flos": 24790895838720.0, + "grad_norm": 1.6802066583015651, + "language_loss": 0.76982903, + "learning_rate": 4.960850036419073e-07, + "loss": 0.79146802, + "num_input_tokens_seen": 139307135, + "step": 6468, + "time_per_iteration": 2.494659423828125 + }, + { + "auxiliary_loss_clip": 0.01134654, + "auxiliary_loss_mlp": 0.01021582, + "balance_loss_clip": 1.0441258, + "balance_loss_mlp": 1.01435757, + "epoch": 0.7778512595442795, + "flos": 17272530253440.0, + "grad_norm": 2.1862019421500607, + "language_loss": 0.78757286, + "learning_rate": 4.955716105877378e-07, + "loss": 0.80913526, + "num_input_tokens_seen": 139325905, + "step": 6469, + "time_per_iteration": 3.275754928588867 + }, + { + "auxiliary_loss_clip": 0.01156343, + "auxiliary_loss_mlp": 0.00761567, + "balance_loss_clip": 1.04682326, + "balance_loss_mlp": 1.00036108, + "epoch": 0.7779715024349185, + "flos": 17748418567680.0, + "grad_norm": 1.7528109889418506, + "language_loss": 0.82955408, + "learning_rate": 4.950584457541598e-07, + "loss": 0.84873319, + "num_input_tokens_seen": 139344370, + "step": 6470, + "time_per_iteration": 3.13714861869812 + }, + { + "auxiliary_loss_clip": 0.01154319, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.04663897, + "balance_loss_mlp": 1.01850057, + "epoch": 0.7780917453255576, + "flos": 24316875031680.0, + "grad_norm": 1.347970465380064, + "language_loss": 0.81814688, + "learning_rate": 4.945455092190183e-07, + "loss": 0.83994293, + "num_input_tokens_seen": 139365625, + "step": 6471, + "time_per_iteration": 3.3791353702545166 + }, + { + "auxiliary_loss_clip": 0.01065539, + "auxiliary_loss_mlp": 0.01000919, + "balance_loss_clip": 1.01064253, + "balance_loss_mlp": 0.99985808, + "epoch": 0.7782119882161967, + "flos": 56364601530240.0, + "grad_norm": 0.6771088358195161, + "language_loss": 0.55961573, + "learning_rate": 4.940328010601271e-07, + "loss": 0.5802803, + "num_input_tokens_seen": 139430540, + "step": 6472, + "time_per_iteration": 3.017413377761841 + }, + { + "auxiliary_loss_clip": 0.01151504, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.05152142, + "balance_loss_mlp": 1.02137816, + "epoch": 0.7783322311068358, + "flos": 46789986994560.0, + "grad_norm": 1.719036534090131, + "language_loss": 0.76680022, + "learning_rate": 4.935203213552621e-07, + "loss": 0.78860486, + "num_input_tokens_seen": 139454280, + "step": 6473, + "time_per_iteration": 2.691930055618286 + }, + { + "auxiliary_loss_clip": 0.01141165, + "auxiliary_loss_mlp": 0.01023115, + "balance_loss_clip": 1.0461632, + "balance_loss_mlp": 1.01555395, + "epoch": 0.7784524739974749, + "flos": 19057864872960.0, + "grad_norm": 8.55752309875727, + "language_loss": 0.66779333, + "learning_rate": 4.930080701821662e-07, + "loss": 0.68943614, + "num_input_tokens_seen": 139471745, + "step": 6474, + "time_per_iteration": 2.4473488330841064 + }, + { + "auxiliary_loss_clip": 0.01139418, + "auxiliary_loss_mlp": 0.01025608, + "balance_loss_clip": 1.04459834, + "balance_loss_mlp": 1.01850629, + "epoch": 0.778572716888114, + "flos": 24791111320320.0, + "grad_norm": 2.3501120765167505, + "language_loss": 0.77194738, + "learning_rate": 4.92496047618548e-07, + "loss": 0.7935977, + "num_input_tokens_seen": 139491505, + "step": 6475, + "time_per_iteration": 3.2657601833343506 + }, + { + "auxiliary_loss_clip": 0.01157503, + "auxiliary_loss_mlp": 0.01022966, + "balance_loss_clip": 1.05059385, + "balance_loss_mlp": 1.01574206, + "epoch": 0.7786929597787531, + "flos": 20078086867200.0, + "grad_norm": 1.7811780938141082, + "language_loss": 0.77793294, + "learning_rate": 4.919842537420811e-07, + "loss": 0.79973757, + "num_input_tokens_seen": 139508620, + "step": 6476, + "time_per_iteration": 2.42966890335083 + }, + { + "auxiliary_loss_clip": 0.01140987, + "auxiliary_loss_mlp": 0.01026943, + "balance_loss_clip": 1.04864883, + "balance_loss_mlp": 1.02031803, + "epoch": 0.7788132026693921, + "flos": 21872220318720.0, + "grad_norm": 1.5867789015892955, + "language_loss": 0.79264104, + "learning_rate": 4.91472688630404e-07, + "loss": 0.81432033, + "num_input_tokens_seen": 139529360, + "step": 6477, + "time_per_iteration": 2.474210500717163 + }, + { + "auxiliary_loss_clip": 0.01163792, + "auxiliary_loss_mlp": 0.01020733, + "balance_loss_clip": 1.04691815, + "balance_loss_mlp": 1.01408958, + "epoch": 0.7789334455600313, + "flos": 11181937351680.0, + "grad_norm": 1.840047594253558, + "language_loss": 0.74176162, + "learning_rate": 4.909613523611202e-07, + "loss": 0.76360685, + "num_input_tokens_seen": 139546240, + "step": 6478, + "time_per_iteration": 2.438084363937378 + }, + { + "auxiliary_loss_clip": 0.01106869, + "auxiliary_loss_mlp": 0.00761989, + "balance_loss_clip": 1.03926635, + "balance_loss_mlp": 1.00046504, + "epoch": 0.7790536884506704, + "flos": 28695427015680.0, + "grad_norm": 1.7131593213837262, + "language_loss": 0.74512082, + "learning_rate": 4.904502450117991e-07, + "loss": 0.76380944, + "num_input_tokens_seen": 139567200, + "step": 6479, + "time_per_iteration": 2.602010726928711 + }, + { + "auxiliary_loss_clip": 0.01138148, + "auxiliary_loss_mlp": 0.0102528, + "balance_loss_clip": 1.0487994, + "balance_loss_mlp": 1.01820242, + "epoch": 0.7791739313413094, + "flos": 11072302064640.0, + "grad_norm": 2.2655887327138386, + "language_loss": 0.7183193, + "learning_rate": 4.899393666599762e-07, + "loss": 0.73995364, + "num_input_tokens_seen": 139583775, + "step": 6480, + "time_per_iteration": 2.4409658908843994 + }, + { + "auxiliary_loss_clip": 0.01164531, + "auxiliary_loss_mlp": 0.01019843, + "balance_loss_clip": 1.04551697, + "balance_loss_mlp": 1.01322973, + "epoch": 0.7792941742319486, + "flos": 14679276975360.0, + "grad_norm": 2.451872446933454, + "language_loss": 0.72448373, + "learning_rate": 4.894287173831506e-07, + "loss": 0.74632752, + "num_input_tokens_seen": 139599735, + "step": 6481, + "time_per_iteration": 2.357515335083008 + }, + { + "auxiliary_loss_clip": 0.01138921, + "auxiliary_loss_mlp": 0.01025041, + "balance_loss_clip": 1.04311824, + "balance_loss_mlp": 1.01717901, + "epoch": 0.7794144171225876, + "flos": 23258874908160.0, + "grad_norm": 2.7050119523459117, + "language_loss": 0.84315956, + "learning_rate": 4.889182972587877e-07, + "loss": 0.8647992, + "num_input_tokens_seen": 139619030, + "step": 6482, + "time_per_iteration": 2.469722270965576 + }, + { + "auxiliary_loss_clip": 0.01132318, + "auxiliary_loss_mlp": 0.01025619, + "balance_loss_clip": 1.04506886, + "balance_loss_mlp": 1.01877379, + "epoch": 0.7795346600132267, + "flos": 21507080613120.0, + "grad_norm": 1.7858208545962788, + "language_loss": 0.66032982, + "learning_rate": 4.884081063643177e-07, + "loss": 0.68190914, + "num_input_tokens_seen": 139637690, + "step": 6483, + "time_per_iteration": 2.518369436264038 + }, + { + "auxiliary_loss_clip": 0.01040475, + "auxiliary_loss_mlp": 0.0100151, + "balance_loss_clip": 1.01195192, + "balance_loss_mlp": 1.00052702, + "epoch": 0.7796549029038659, + "flos": 70052273694720.0, + "grad_norm": 0.8648663077288179, + "language_loss": 0.52530181, + "learning_rate": 4.878981447771353e-07, + "loss": 0.54572165, + "num_input_tokens_seen": 139692070, + "step": 6484, + "time_per_iteration": 3.0795040130615234 + }, + { + "auxiliary_loss_clip": 0.01118967, + "auxiliary_loss_mlp": 0.01025686, + "balance_loss_clip": 1.04298294, + "balance_loss_mlp": 1.01744246, + "epoch": 0.7797751457945049, + "flos": 23989405714560.0, + "grad_norm": 1.651649588835177, + "language_loss": 0.73228383, + "learning_rate": 4.873884125746035e-07, + "loss": 0.75373042, + "num_input_tokens_seen": 139713745, + "step": 6485, + "time_per_iteration": 2.5281119346618652 + }, + { + "auxiliary_loss_clip": 0.0113465, + "auxiliary_loss_mlp": 0.01019535, + "balance_loss_clip": 1.04448938, + "balance_loss_mlp": 1.01248991, + "epoch": 0.779895388685144, + "flos": 22674751937280.0, + "grad_norm": 2.448838597906675, + "language_loss": 0.72434759, + "learning_rate": 4.868789098340456e-07, + "loss": 0.74588943, + "num_input_tokens_seen": 139731650, + "step": 6486, + "time_per_iteration": 2.473839521408081 + }, + { + "auxiliary_loss_clip": 0.01123394, + "auxiliary_loss_mlp": 0.01023626, + "balance_loss_clip": 1.04187012, + "balance_loss_mlp": 1.01695895, + "epoch": 0.7800156315757831, + "flos": 23768698596480.0, + "grad_norm": 4.081162562167619, + "language_loss": 0.73117936, + "learning_rate": 4.863696366327543e-07, + "loss": 0.75264961, + "num_input_tokens_seen": 139750820, + "step": 6487, + "time_per_iteration": 2.516204595565796 + }, + { + "auxiliary_loss_clip": 0.01153253, + "auxiliary_loss_mlp": 0.01026933, + "balance_loss_clip": 1.04431617, + "balance_loss_mlp": 1.01979876, + "epoch": 0.7801358744664222, + "flos": 26429714881920.0, + "grad_norm": 1.8289264418598066, + "language_loss": 0.7812897, + "learning_rate": 4.85860593047986e-07, + "loss": 0.80309165, + "num_input_tokens_seen": 139770885, + "step": 6488, + "time_per_iteration": 2.492370367050171 + }, + { + "auxiliary_loss_clip": 0.01115999, + "auxiliary_loss_mlp": 0.01024151, + "balance_loss_clip": 1.03857565, + "balance_loss_mlp": 1.0173409, + "epoch": 0.7802561173570612, + "flos": 26322162583680.0, + "grad_norm": 2.1004130141492454, + "language_loss": 0.74782205, + "learning_rate": 4.853517791569613e-07, + "loss": 0.76922357, + "num_input_tokens_seen": 139793065, + "step": 6489, + "time_per_iteration": 2.557070016860962 + }, + { + "auxiliary_loss_clip": 0.01144002, + "auxiliary_loss_mlp": 0.00762092, + "balance_loss_clip": 1.04451227, + "balance_loss_mlp": 1.00033188, + "epoch": 0.7803763602477004, + "flos": 40333751596800.0, + "grad_norm": 1.714467869202251, + "language_loss": 0.66034502, + "learning_rate": 4.848431950368684e-07, + "loss": 0.67940599, + "num_input_tokens_seen": 139815625, + "step": 6490, + "time_per_iteration": 2.636080026626587 + }, + { + "auxiliary_loss_clip": 0.01065659, + "auxiliary_loss_mlp": 0.00752864, + "balance_loss_clip": 1.01103115, + "balance_loss_mlp": 0.99989241, + "epoch": 0.7804966031383395, + "flos": 67001448038400.0, + "grad_norm": 0.712297458260766, + "language_loss": 0.55715853, + "learning_rate": 4.843348407648569e-07, + "loss": 0.57534379, + "num_input_tokens_seen": 139876905, + "step": 6491, + "time_per_iteration": 2.953810453414917 + }, + { + "auxiliary_loss_clip": 0.01153366, + "auxiliary_loss_mlp": 0.01024581, + "balance_loss_clip": 1.04263616, + "balance_loss_mlp": 1.01674891, + "epoch": 0.7806168460289785, + "flos": 17740733057280.0, + "grad_norm": 2.377935191854429, + "language_loss": 0.82714969, + "learning_rate": 4.838267164180457e-07, + "loss": 0.84892911, + "num_input_tokens_seen": 139892575, + "step": 6492, + "time_per_iteration": 2.399416208267212 + }, + { + "auxiliary_loss_clip": 0.01168954, + "auxiliary_loss_mlp": 0.01024881, + "balance_loss_clip": 1.04785323, + "balance_loss_mlp": 1.01714444, + "epoch": 0.7807370889196176, + "flos": 23946240545280.0, + "grad_norm": 2.0274687536790608, + "language_loss": 0.83741719, + "learning_rate": 4.833188220735156e-07, + "loss": 0.85935557, + "num_input_tokens_seen": 139912245, + "step": 6493, + "time_per_iteration": 2.419259548187256 + }, + { + "auxiliary_loss_clip": 0.01151573, + "auxiliary_loss_mlp": 0.0102186, + "balance_loss_clip": 1.046157, + "balance_loss_mlp": 1.01484728, + "epoch": 0.7808573318102567, + "flos": 18989024457600.0, + "grad_norm": 2.023430002560144, + "language_loss": 0.74718851, + "learning_rate": 4.828111578083152e-07, + "loss": 0.76892281, + "num_input_tokens_seen": 139929150, + "step": 6494, + "time_per_iteration": 2.4137871265411377 + }, + { + "auxiliary_loss_clip": 0.0113444, + "auxiliary_loss_mlp": 0.01026902, + "balance_loss_clip": 1.0453341, + "balance_loss_mlp": 1.01955867, + "epoch": 0.7809775747008958, + "flos": 23980750536960.0, + "grad_norm": 2.6199241854478497, + "language_loss": 0.8170386, + "learning_rate": 4.823037236994556e-07, + "loss": 0.83865201, + "num_input_tokens_seen": 139947315, + "step": 6495, + "time_per_iteration": 2.527867078781128 + }, + { + "auxiliary_loss_clip": 0.01056259, + "auxiliary_loss_mlp": 0.01000758, + "balance_loss_clip": 1.01057625, + "balance_loss_mlp": 0.99972039, + "epoch": 0.7810978175915348, + "flos": 68535875180160.0, + "grad_norm": 0.7132369122011739, + "language_loss": 0.56343669, + "learning_rate": 4.817965198239136e-07, + "loss": 0.58400691, + "num_input_tokens_seen": 140013775, + "step": 6496, + "time_per_iteration": 3.8623228073120117 + }, + { + "auxiliary_loss_clip": 0.0112264, + "auxiliary_loss_mlp": 0.01025794, + "balance_loss_clip": 1.04158223, + "balance_loss_mlp": 1.01798308, + "epoch": 0.781218060482174, + "flos": 19642131498240.0, + "grad_norm": 2.1266018211535296, + "language_loss": 0.74443281, + "learning_rate": 4.812895462586331e-07, + "loss": 0.76591718, + "num_input_tokens_seen": 140031600, + "step": 6497, + "time_per_iteration": 3.2123382091522217 + }, + { + "auxiliary_loss_clip": 0.01127468, + "auxiliary_loss_mlp": 0.01022041, + "balance_loss_clip": 1.04589307, + "balance_loss_mlp": 1.0156728, + "epoch": 0.7813383033728131, + "flos": 25627865621760.0, + "grad_norm": 1.7443390149669613, + "language_loss": 0.82332885, + "learning_rate": 4.807828030805207e-07, + "loss": 0.8448239, + "num_input_tokens_seen": 140050590, + "step": 6498, + "time_per_iteration": 3.3967041969299316 + }, + { + "auxiliary_loss_clip": 0.01151899, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.04835701, + "balance_loss_mlp": 1.02563453, + "epoch": 0.7814585462634521, + "flos": 20485924865280.0, + "grad_norm": 1.7733321957795256, + "language_loss": 0.68363786, + "learning_rate": 4.802762903664495e-07, + "loss": 0.70548934, + "num_input_tokens_seen": 140069770, + "step": 6499, + "time_per_iteration": 2.540027618408203 + }, + { + "auxiliary_loss_clip": 0.01146462, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.04868495, + "balance_loss_mlp": 1.02036452, + "epoch": 0.7815787891540913, + "flos": 22304297018880.0, + "grad_norm": 2.278488497321996, + "language_loss": 0.73927927, + "learning_rate": 4.797700081932565e-07, + "loss": 0.76102424, + "num_input_tokens_seen": 140087635, + "step": 6500, + "time_per_iteration": 2.468290328979492 + }, + { + "auxiliary_loss_clip": 0.01089879, + "auxiliary_loss_mlp": 0.01025554, + "balance_loss_clip": 1.03757548, + "balance_loss_mlp": 1.01852059, + "epoch": 0.7816990320447303, + "flos": 22600668136320.0, + "grad_norm": 2.353676989737286, + "language_loss": 0.82056361, + "learning_rate": 4.792639566377442e-07, + "loss": 0.84171796, + "num_input_tokens_seen": 140105045, + "step": 6501, + "time_per_iteration": 2.5547449588775635 + }, + { + "auxiliary_loss_clip": 0.01146882, + "auxiliary_loss_mlp": 0.01020525, + "balance_loss_clip": 1.04369044, + "balance_loss_mlp": 1.01303852, + "epoch": 0.7818192749353694, + "flos": 24935974871040.0, + "grad_norm": 1.7143268110907315, + "language_loss": 0.77669728, + "learning_rate": 4.78758135776681e-07, + "loss": 0.79837132, + "num_input_tokens_seen": 140124900, + "step": 6502, + "time_per_iteration": 3.249781608581543 + }, + { + "auxiliary_loss_clip": 0.01139391, + "auxiliary_loss_mlp": 0.0102537, + "balance_loss_clip": 1.04566169, + "balance_loss_mlp": 1.0186348, + "epoch": 0.7819395178260086, + "flos": 23733039369600.0, + "grad_norm": 2.052509956360546, + "language_loss": 0.78826964, + "learning_rate": 4.782525456867989e-07, + "loss": 0.80991721, + "num_input_tokens_seen": 140143755, + "step": 6503, + "time_per_iteration": 2.4841930866241455 + }, + { + "auxiliary_loss_clip": 0.01125685, + "auxiliary_loss_mlp": 0.01025145, + "balance_loss_clip": 1.04330504, + "balance_loss_mlp": 1.01759315, + "epoch": 0.7820597607166476, + "flos": 23221671396480.0, + "grad_norm": 2.2475392216689696, + "language_loss": 0.83056802, + "learning_rate": 4.777471864447959e-07, + "loss": 0.85207629, + "num_input_tokens_seen": 140164495, + "step": 6504, + "time_per_iteration": 2.5042660236358643 + }, + { + "auxiliary_loss_clip": 0.01139824, + "auxiliary_loss_mlp": 0.01029602, + "balance_loss_clip": 1.04402256, + "balance_loss_mlp": 1.02259493, + "epoch": 0.7821800036072867, + "flos": 22309540404480.0, + "grad_norm": 2.289048015419642, + "language_loss": 0.80342984, + "learning_rate": 4.772420581273344e-07, + "loss": 0.82512403, + "num_input_tokens_seen": 140181980, + "step": 6505, + "time_per_iteration": 2.4771053791046143 + }, + { + "auxiliary_loss_clip": 0.01148002, + "auxiliary_loss_mlp": 0.01022983, + "balance_loss_clip": 1.0461762, + "balance_loss_mlp": 1.01564002, + "epoch": 0.7823002464979258, + "flos": 21544176384000.0, + "grad_norm": 1.8111270710972136, + "language_loss": 0.75861347, + "learning_rate": 4.7673716081104134e-07, + "loss": 0.78032333, + "num_input_tokens_seen": 140202155, + "step": 6506, + "time_per_iteration": 2.4428014755249023 + }, + { + "auxiliary_loss_clip": 0.01153807, + "auxiliary_loss_mlp": 0.01025059, + "balance_loss_clip": 1.04919577, + "balance_loss_mlp": 1.01808512, + "epoch": 0.7824204893885649, + "flos": 24535642815360.0, + "grad_norm": 1.6172844939184783, + "language_loss": 0.84568745, + "learning_rate": 4.762324945725109e-07, + "loss": 0.86747611, + "num_input_tokens_seen": 140221600, + "step": 6507, + "time_per_iteration": 2.466966152191162 + }, + { + "auxiliary_loss_clip": 0.01134125, + "auxiliary_loss_mlp": 0.01026296, + "balance_loss_clip": 1.0478878, + "balance_loss_mlp": 1.0193938, + "epoch": 0.782540732279204, + "flos": 27415211402880.0, + "grad_norm": 1.6534168662577466, + "language_loss": 0.75591129, + "learning_rate": 4.7572805948829844e-07, + "loss": 0.77751541, + "num_input_tokens_seen": 140241860, + "step": 6508, + "time_per_iteration": 2.501706123352051 + }, + { + "auxiliary_loss_clip": 0.01115915, + "auxiliary_loss_mlp": 0.01021402, + "balance_loss_clip": 1.04416871, + "balance_loss_mlp": 1.01469302, + "epoch": 0.7826609751698431, + "flos": 24353216616960.0, + "grad_norm": 2.0298483841624435, + "language_loss": 0.71122849, + "learning_rate": 4.7522385563492795e-07, + "loss": 0.7326017, + "num_input_tokens_seen": 140262160, + "step": 6509, + "time_per_iteration": 2.5437140464782715 + }, + { + "auxiliary_loss_clip": 0.01131003, + "auxiliary_loss_mlp": 0.01025451, + "balance_loss_clip": 1.0479269, + "balance_loss_mlp": 1.01852536, + "epoch": 0.7827812180604822, + "flos": 23988543788160.0, + "grad_norm": 1.8421312027599637, + "language_loss": 0.70350665, + "learning_rate": 4.747198830888863e-07, + "loss": 0.72507119, + "num_input_tokens_seen": 140282030, + "step": 6510, + "time_per_iteration": 2.5157268047332764 + }, + { + "auxiliary_loss_clip": 0.01133541, + "auxiliary_loss_mlp": 0.01026722, + "balance_loss_clip": 1.04472017, + "balance_loss_mlp": 1.01967096, + "epoch": 0.7829014609511212, + "flos": 27454318335360.0, + "grad_norm": 2.8099577356954786, + "language_loss": 0.68423414, + "learning_rate": 4.742161419266251e-07, + "loss": 0.70583677, + "num_input_tokens_seen": 140301190, + "step": 6511, + "time_per_iteration": 2.5029919147491455 + }, + { + "auxiliary_loss_clip": 0.0115644, + "auxiliary_loss_mlp": 0.01027691, + "balance_loss_clip": 1.04618263, + "balance_loss_mlp": 1.01999009, + "epoch": 0.7830217038417604, + "flos": 29204532432000.0, + "grad_norm": 3.2335599872123586, + "language_loss": 0.64929855, + "learning_rate": 4.7371263222456304e-07, + "loss": 0.67113984, + "num_input_tokens_seen": 140318510, + "step": 6512, + "time_per_iteration": 2.479785680770874 + }, + { + "auxiliary_loss_clip": 0.01051348, + "auxiliary_loss_mlp": 0.01001284, + "balance_loss_clip": 1.01059556, + "balance_loss_mlp": 1.00031281, + "epoch": 0.7831419467323995, + "flos": 60950895822720.0, + "grad_norm": 0.8085924692440947, + "language_loss": 0.61353695, + "learning_rate": 4.7320935405908004e-07, + "loss": 0.63406324, + "num_input_tokens_seen": 140379380, + "step": 6513, + "time_per_iteration": 2.9893112182617188 + }, + { + "auxiliary_loss_clip": 0.01168898, + "auxiliary_loss_mlp": 0.01025646, + "balance_loss_clip": 1.04777169, + "balance_loss_mlp": 1.01790285, + "epoch": 0.7832621896230385, + "flos": 19682531320320.0, + "grad_norm": 2.0746706653203, + "language_loss": 0.84062672, + "learning_rate": 4.7270630750652475e-07, + "loss": 0.86257219, + "num_input_tokens_seen": 140395335, + "step": 6514, + "time_per_iteration": 2.3778579235076904 + }, + { + "auxiliary_loss_clip": 0.01149626, + "auxiliary_loss_mlp": 0.01020532, + "balance_loss_clip": 1.04434681, + "balance_loss_mlp": 1.01382351, + "epoch": 0.7833824325136777, + "flos": 25009232659200.0, + "grad_norm": 1.6879473377035084, + "language_loss": 0.80741167, + "learning_rate": 4.7220349264320746e-07, + "loss": 0.82911325, + "num_input_tokens_seen": 140414420, + "step": 6515, + "time_per_iteration": 2.4611709117889404 + }, + { + "auxiliary_loss_clip": 0.01054559, + "auxiliary_loss_mlp": 0.01000669, + "balance_loss_clip": 1.01051331, + "balance_loss_mlp": 0.99963778, + "epoch": 0.7835026754043167, + "flos": 68800142517120.0, + "grad_norm": 0.7329849561181503, + "language_loss": 0.54916102, + "learning_rate": 4.71700909545407e-07, + "loss": 0.56971329, + "num_input_tokens_seen": 140477365, + "step": 6516, + "time_per_iteration": 3.0265395641326904 + }, + { + "auxiliary_loss_clip": 0.01154139, + "auxiliary_loss_mlp": 0.01021434, + "balance_loss_clip": 1.04637933, + "balance_loss_mlp": 1.01434982, + "epoch": 0.7836229182949558, + "flos": 19864598382720.0, + "grad_norm": 1.8738135362949782, + "language_loss": 0.77109027, + "learning_rate": 4.711985582893627e-07, + "loss": 0.79284596, + "num_input_tokens_seen": 140495885, + "step": 6517, + "time_per_iteration": 2.413577079772949 + }, + { + "auxiliary_loss_clip": 0.01112179, + "auxiliary_loss_mlp": 0.01021841, + "balance_loss_clip": 1.04055035, + "balance_loss_mlp": 1.0145036, + "epoch": 0.783743161185595, + "flos": 22965843755520.0, + "grad_norm": 1.6402273495031856, + "language_loss": 0.71610463, + "learning_rate": 4.706964389512811e-07, + "loss": 0.73744488, + "num_input_tokens_seen": 140515920, + "step": 6518, + "time_per_iteration": 2.5515036582946777 + }, + { + "auxiliary_loss_clip": 0.01166309, + "auxiliary_loss_mlp": 0.01020135, + "balance_loss_clip": 1.04967618, + "balance_loss_mlp": 1.01331663, + "epoch": 0.783863404076234, + "flos": 12458489777280.0, + "grad_norm": 1.9073058921881056, + "language_loss": 0.87306106, + "learning_rate": 4.701945516073345e-07, + "loss": 0.89492559, + "num_input_tokens_seen": 140533395, + "step": 6519, + "time_per_iteration": 2.369558572769165 + }, + { + "auxiliary_loss_clip": 0.01122572, + "auxiliary_loss_mlp": 0.01020777, + "balance_loss_clip": 1.04431868, + "balance_loss_mlp": 1.01414037, + "epoch": 0.7839836469668731, + "flos": 24243940465920.0, + "grad_norm": 1.7950173901195265, + "language_loss": 0.75393915, + "learning_rate": 4.696928963336577e-07, + "loss": 0.77537262, + "num_input_tokens_seen": 140552825, + "step": 6520, + "time_per_iteration": 2.5475728511810303 + }, + { + "auxiliary_loss_clip": 0.01051419, + "auxiliary_loss_mlp": 0.01001331, + "balance_loss_clip": 1.01031482, + "balance_loss_mlp": 1.00031209, + "epoch": 0.7841038898575122, + "flos": 62121978938880.0, + "grad_norm": 0.8534967495046033, + "language_loss": 0.61017454, + "learning_rate": 4.6919147320635224e-07, + "loss": 0.63070214, + "num_input_tokens_seen": 140615535, + "step": 6521, + "time_per_iteration": 2.9999747276306152 + }, + { + "auxiliary_loss_clip": 0.01154218, + "auxiliary_loss_mlp": 0.01024606, + "balance_loss_clip": 1.04555976, + "balance_loss_mlp": 1.01783764, + "epoch": 0.7842241327481513, + "flos": 20193899293440.0, + "grad_norm": 2.342963454017732, + "language_loss": 0.72994047, + "learning_rate": 4.6869028230148286e-07, + "loss": 0.75172877, + "num_input_tokens_seen": 140633330, + "step": 6522, + "time_per_iteration": 3.2662525177001953 + }, + { + "auxiliary_loss_clip": 0.01118504, + "auxiliary_loss_mlp": 0.01024257, + "balance_loss_clip": 1.04025996, + "balance_loss_mlp": 1.01652002, + "epoch": 0.7843443756387903, + "flos": 28074531496320.0, + "grad_norm": 2.34381135662973, + "language_loss": 0.59847081, + "learning_rate": 4.6818932369507957e-07, + "loss": 0.61989844, + "num_input_tokens_seen": 140652830, + "step": 6523, + "time_per_iteration": 3.2541370391845703 + }, + { + "auxiliary_loss_clip": 0.01153993, + "auxiliary_loss_mlp": 0.01026472, + "balance_loss_clip": 1.04981923, + "balance_loss_mlp": 1.01931953, + "epoch": 0.7844646185294295, + "flos": 21323397438720.0, + "grad_norm": 2.20927760909072, + "language_loss": 0.88968349, + "learning_rate": 4.676885974631386e-07, + "loss": 0.91148812, + "num_input_tokens_seen": 140671190, + "step": 6524, + "time_per_iteration": 2.43854022026062 + }, + { + "auxiliary_loss_clip": 0.01153788, + "auxiliary_loss_mlp": 0.01022298, + "balance_loss_clip": 1.04735041, + "balance_loss_mlp": 1.01559854, + "epoch": 0.7845848614200686, + "flos": 23656585271040.0, + "grad_norm": 1.9212038267190354, + "language_loss": 0.81113577, + "learning_rate": 4.67188103681619e-07, + "loss": 0.83289659, + "num_input_tokens_seen": 140690975, + "step": 6525, + "time_per_iteration": 3.256356716156006 + }, + { + "auxiliary_loss_clip": 0.01152803, + "auxiliary_loss_mlp": 0.00761716, + "balance_loss_clip": 1.05069482, + "balance_loss_mlp": 1.00032806, + "epoch": 0.7847051043107076, + "flos": 23402194174080.0, + "grad_norm": 4.33744629859235, + "language_loss": 0.6875186, + "learning_rate": 4.666878424264453e-07, + "loss": 0.70666373, + "num_input_tokens_seen": 140710930, + "step": 6526, + "time_per_iteration": 2.444742441177368 + }, + { + "auxiliary_loss_clip": 0.01130504, + "auxiliary_loss_mlp": 0.01019782, + "balance_loss_clip": 1.0446167, + "balance_loss_mlp": 1.01362491, + "epoch": 0.7848253472013467, + "flos": 19022277473280.0, + "grad_norm": 1.8657727553705894, + "language_loss": 0.73486632, + "learning_rate": 4.661878137735069e-07, + "loss": 0.75636923, + "num_input_tokens_seen": 140729120, + "step": 6527, + "time_per_iteration": 2.444528102874756 + }, + { + "auxiliary_loss_clip": 0.01137561, + "auxiliary_loss_mlp": 0.01021878, + "balance_loss_clip": 1.04501319, + "balance_loss_mlp": 1.01512766, + "epoch": 0.7849455900919858, + "flos": 21179180332800.0, + "grad_norm": 2.172175631590525, + "language_loss": 0.74662203, + "learning_rate": 4.656880177986571e-07, + "loss": 0.76821643, + "num_input_tokens_seen": 140747665, + "step": 6528, + "time_per_iteration": 2.4619085788726807 + }, + { + "auxiliary_loss_clip": 0.01140805, + "auxiliary_loss_mlp": 0.01024067, + "balance_loss_clip": 1.04357326, + "balance_loss_mlp": 1.0165273, + "epoch": 0.7850658329826249, + "flos": 19536482620800.0, + "grad_norm": 1.8828479089549728, + "language_loss": 0.81276512, + "learning_rate": 4.6518845457771607e-07, + "loss": 0.83441389, + "num_input_tokens_seen": 140766525, + "step": 6529, + "time_per_iteration": 3.2345666885375977 + }, + { + "auxiliary_loss_clip": 0.01144647, + "auxiliary_loss_mlp": 0.00761501, + "balance_loss_clip": 1.04440784, + "balance_loss_mlp": 1.00037599, + "epoch": 0.7851860758732639, + "flos": 12495334152960.0, + "grad_norm": 1.7715898218398218, + "language_loss": 0.78886473, + "learning_rate": 4.646891241864652e-07, + "loss": 0.8079263, + "num_input_tokens_seen": 140785090, + "step": 6530, + "time_per_iteration": 2.420994520187378 + }, + { + "auxiliary_loss_clip": 0.01151563, + "auxiliary_loss_mlp": 0.01027857, + "balance_loss_clip": 1.04540133, + "balance_loss_mlp": 1.02001297, + "epoch": 0.7853063187639031, + "flos": 22960959505920.0, + "grad_norm": 2.0034811796725855, + "language_loss": 0.73514444, + "learning_rate": 4.6419002670065397e-07, + "loss": 0.75693864, + "num_input_tokens_seen": 140804670, + "step": 6531, + "time_per_iteration": 2.4273664951324463 + }, + { + "auxiliary_loss_clip": 0.01129057, + "auxiliary_loss_mlp": 0.01028411, + "balance_loss_clip": 1.04596663, + "balance_loss_mlp": 1.02072835, + "epoch": 0.7854265616545422, + "flos": 17347260499200.0, + "grad_norm": 3.4046561018796853, + "language_loss": 0.86753833, + "learning_rate": 4.6369116219599445e-07, + "loss": 0.88911295, + "num_input_tokens_seen": 140820655, + "step": 6532, + "time_per_iteration": 2.481265068054199 + }, + { + "auxiliary_loss_clip": 0.01123649, + "auxiliary_loss_mlp": 0.0102251, + "balance_loss_clip": 1.04358661, + "balance_loss_mlp": 1.01591802, + "epoch": 0.7855468045451812, + "flos": 23838293197440.0, + "grad_norm": 1.657003108576288, + "language_loss": 0.79092884, + "learning_rate": 4.631925307481637e-07, + "loss": 0.81239045, + "num_input_tokens_seen": 140840470, + "step": 6533, + "time_per_iteration": 2.51924467086792 + }, + { + "auxiliary_loss_clip": 0.01137392, + "auxiliary_loss_mlp": 0.01022508, + "balance_loss_clip": 1.04688919, + "balance_loss_mlp": 1.01583219, + "epoch": 0.7856670474358204, + "flos": 25666792986240.0, + "grad_norm": 1.94763198897157, + "language_loss": 0.75912499, + "learning_rate": 4.6269413243280533e-07, + "loss": 0.78072405, + "num_input_tokens_seen": 140859890, + "step": 6534, + "time_per_iteration": 2.507854461669922 + }, + { + "auxiliary_loss_clip": 0.01146527, + "auxiliary_loss_mlp": 0.0102237, + "balance_loss_clip": 1.05017853, + "balance_loss_mlp": 1.01415014, + "epoch": 0.7857872903264594, + "flos": 18144656472960.0, + "grad_norm": 2.7598073972263037, + "language_loss": 0.74365258, + "learning_rate": 4.621959673255236e-07, + "loss": 0.76534152, + "num_input_tokens_seen": 140876190, + "step": 6535, + "time_per_iteration": 2.435556411743164 + }, + { + "auxiliary_loss_clip": 0.01107208, + "auxiliary_loss_mlp": 0.01026101, + "balance_loss_clip": 1.04113913, + "balance_loss_mlp": 1.01904368, + "epoch": 0.7859075332170985, + "flos": 14386138081920.0, + "grad_norm": 2.0730766876488294, + "language_loss": 0.90412748, + "learning_rate": 4.6169803550189135e-07, + "loss": 0.92546058, + "num_input_tokens_seen": 140891885, + "step": 6536, + "time_per_iteration": 2.4852495193481445 + }, + { + "auxiliary_loss_clip": 0.01104958, + "auxiliary_loss_mlp": 0.01028422, + "balance_loss_clip": 1.04470956, + "balance_loss_mlp": 1.02053666, + "epoch": 0.7860277761077377, + "flos": 19864059678720.0, + "grad_norm": 2.048396562476462, + "language_loss": 0.77154273, + "learning_rate": 4.6120033703744355e-07, + "loss": 0.7928766, + "num_input_tokens_seen": 140910780, + "step": 6537, + "time_per_iteration": 2.513505697250366 + }, + { + "auxiliary_loss_clip": 0.01126414, + "auxiliary_loss_mlp": 0.0102377, + "balance_loss_clip": 1.04220009, + "balance_loss_mlp": 1.01680577, + "epoch": 0.7861480189983767, + "flos": 26396174557440.0, + "grad_norm": 1.7797252122112697, + "language_loss": 0.78226602, + "learning_rate": 4.607028720076822e-07, + "loss": 0.80376792, + "num_input_tokens_seen": 140927460, + "step": 6538, + "time_per_iteration": 2.5055649280548096 + }, + { + "auxiliary_loss_clip": 0.01152233, + "auxiliary_loss_mlp": 0.01024347, + "balance_loss_clip": 1.04643404, + "balance_loss_mlp": 1.0172925, + "epoch": 0.7862682618890158, + "flos": 24236578177920.0, + "grad_norm": 1.882501517667879, + "language_loss": 0.73534894, + "learning_rate": 4.6020564048807074e-07, + "loss": 0.75711471, + "num_input_tokens_seen": 140945135, + "step": 6539, + "time_per_iteration": 2.4453020095825195 + }, + { + "auxiliary_loss_clip": 0.01156743, + "auxiliary_loss_mlp": 0.01024812, + "balance_loss_clip": 1.04839015, + "balance_loss_mlp": 1.01742077, + "epoch": 0.7863885047796549, + "flos": 47551508259840.0, + "grad_norm": 2.340499658063534, + "language_loss": 0.71783292, + "learning_rate": 4.5970864255403883e-07, + "loss": 0.73964846, + "num_input_tokens_seen": 140966660, + "step": 6540, + "time_per_iteration": 2.655489444732666 + }, + { + "auxiliary_loss_clip": 0.01140207, + "auxiliary_loss_mlp": 0.01022249, + "balance_loss_clip": 1.04365444, + "balance_loss_mlp": 1.01574016, + "epoch": 0.786508747670294, + "flos": 24389234979840.0, + "grad_norm": 1.8667333097532215, + "language_loss": 0.81985426, + "learning_rate": 4.59211878280982e-07, + "loss": 0.84147882, + "num_input_tokens_seen": 140986175, + "step": 6541, + "time_per_iteration": 2.4602620601654053 + }, + { + "auxiliary_loss_clip": 0.01138426, + "auxiliary_loss_mlp": 0.01021688, + "balance_loss_clip": 1.04481506, + "balance_loss_mlp": 1.0146513, + "epoch": 0.786628990560933, + "flos": 18041234238720.0, + "grad_norm": 2.638287957537487, + "language_loss": 0.70001912, + "learning_rate": 4.587153477442578e-07, + "loss": 0.72162032, + "num_input_tokens_seen": 141002490, + "step": 6542, + "time_per_iteration": 2.4550085067749023 + }, + { + "auxiliary_loss_clip": 0.01169613, + "auxiliary_loss_mlp": 0.01025984, + "balance_loss_clip": 1.04906929, + "balance_loss_mlp": 1.01834321, + "epoch": 0.7867492334515722, + "flos": 25848860048640.0, + "grad_norm": 3.045472342088166, + "language_loss": 0.81581938, + "learning_rate": 4.582190510191899e-07, + "loss": 0.83777535, + "num_input_tokens_seen": 141021150, + "step": 6543, + "time_per_iteration": 2.4252097606658936 + }, + { + "auxiliary_loss_clip": 0.01121494, + "auxiliary_loss_mlp": 0.01024218, + "balance_loss_clip": 1.04519296, + "balance_loss_mlp": 1.01729512, + "epoch": 0.7868694763422113, + "flos": 16580819070720.0, + "grad_norm": 2.0068830520879373, + "language_loss": 0.87285471, + "learning_rate": 4.5772298818106625e-07, + "loss": 0.89431179, + "num_input_tokens_seen": 141036940, + "step": 6544, + "time_per_iteration": 2.4546384811401367 + }, + { + "auxiliary_loss_clip": 0.01133911, + "auxiliary_loss_mlp": 0.01025222, + "balance_loss_clip": 1.04806626, + "balance_loss_mlp": 1.01776505, + "epoch": 0.7869897192328503, + "flos": 29386276272000.0, + "grad_norm": 2.2657619764442587, + "language_loss": 0.71594775, + "learning_rate": 4.572271593051384e-07, + "loss": 0.73753911, + "num_input_tokens_seen": 141054295, + "step": 6545, + "time_per_iteration": 2.540421962738037 + }, + { + "auxiliary_loss_clip": 0.01105133, + "auxiliary_loss_mlp": 0.0102248, + "balance_loss_clip": 1.04405975, + "balance_loss_mlp": 1.01569057, + "epoch": 0.7871099621234895, + "flos": 17128923678720.0, + "grad_norm": 1.7652684842074522, + "language_loss": 0.78288555, + "learning_rate": 4.567315644666245e-07, + "loss": 0.80416167, + "num_input_tokens_seen": 141073090, + "step": 6546, + "time_per_iteration": 2.508835554122925 + }, + { + "auxiliary_loss_clip": 0.01120778, + "auxiliary_loss_mlp": 0.01020691, + "balance_loss_clip": 1.04563427, + "balance_loss_mlp": 1.01372063, + "epoch": 0.7872302050141285, + "flos": 23440187784960.0, + "grad_norm": 1.9833915426105033, + "language_loss": 0.84761536, + "learning_rate": 4.5623620374070507e-07, + "loss": 0.86903006, + "num_input_tokens_seen": 141092405, + "step": 6547, + "time_per_iteration": 2.4985992908477783 + }, + { + "auxiliary_loss_clip": 0.01032081, + "auxiliary_loss_mlp": 0.01001732, + "balance_loss_clip": 1.00973082, + "balance_loss_mlp": 1.00065935, + "epoch": 0.7873504479047676, + "flos": 65959752689280.0, + "grad_norm": 0.7646426487743536, + "language_loss": 0.58440018, + "learning_rate": 4.557410772025263e-07, + "loss": 0.60473835, + "num_input_tokens_seen": 141154355, + "step": 6548, + "time_per_iteration": 3.1714861392974854 + }, + { + "auxiliary_loss_clip": 0.0113385, + "auxiliary_loss_mlp": 0.01026978, + "balance_loss_clip": 1.04336679, + "balance_loss_mlp": 1.01981091, + "epoch": 0.7874706907954068, + "flos": 23258336204160.0, + "grad_norm": 2.0820165254770107, + "language_loss": 0.66226792, + "learning_rate": 4.5524618492719803e-07, + "loss": 0.68387616, + "num_input_tokens_seen": 141173575, + "step": 6549, + "time_per_iteration": 3.360734462738037 + }, + { + "auxiliary_loss_clip": 0.01151419, + "auxiliary_loss_mlp": 0.01022071, + "balance_loss_clip": 1.04533255, + "balance_loss_mlp": 1.01524878, + "epoch": 0.7875909336860458, + "flos": 28767786963840.0, + "grad_norm": 1.6711485690248198, + "language_loss": 0.79356635, + "learning_rate": 4.54751526989795e-07, + "loss": 0.8153013, + "num_input_tokens_seen": 141195415, + "step": 6550, + "time_per_iteration": 3.2458090782165527 + }, + { + "auxiliary_loss_clip": 0.01154321, + "auxiliary_loss_mlp": 0.01024548, + "balance_loss_clip": 1.04614449, + "balance_loss_mlp": 1.01723158, + "epoch": 0.7877111765766849, + "flos": 18697286194560.0, + "grad_norm": 2.0306043773456373, + "language_loss": 0.79386109, + "learning_rate": 4.5425710346535775e-07, + "loss": 0.81564975, + "num_input_tokens_seen": 141213360, + "step": 6551, + "time_per_iteration": 2.397141456604004 + }, + { + "auxiliary_loss_clip": 0.01153103, + "auxiliary_loss_mlp": 0.01024801, + "balance_loss_clip": 1.04562199, + "balance_loss_mlp": 1.01686788, + "epoch": 0.787831419467324, + "flos": 27592968833280.0, + "grad_norm": 1.9140507309289247, + "language_loss": 0.81705105, + "learning_rate": 4.537629144288877e-07, + "loss": 0.83883017, + "num_input_tokens_seen": 141230815, + "step": 6552, + "time_per_iteration": 3.270242929458618 + }, + { + "auxiliary_loss_clip": 0.01115908, + "auxiliary_loss_mlp": 0.01025015, + "balance_loss_clip": 1.0404191, + "balance_loss_mlp": 1.01798737, + "epoch": 0.7879516623579631, + "flos": 18150187167360.0, + "grad_norm": 2.170461918890001, + "language_loss": 0.75071132, + "learning_rate": 4.5326895995535477e-07, + "loss": 0.77212059, + "num_input_tokens_seen": 141249715, + "step": 6553, + "time_per_iteration": 2.5500857830047607 + }, + { + "auxiliary_loss_clip": 0.01147228, + "auxiliary_loss_mlp": 0.01023523, + "balance_loss_clip": 1.0442307, + "balance_loss_mlp": 1.01643324, + "epoch": 0.7880719052486022, + "flos": 20339193807360.0, + "grad_norm": 2.3007570624229223, + "language_loss": 0.84597278, + "learning_rate": 4.527752401196907e-07, + "loss": 0.86768031, + "num_input_tokens_seen": 141267730, + "step": 6554, + "time_per_iteration": 2.426267385482788 + }, + { + "auxiliary_loss_clip": 0.01133772, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.04474401, + "balance_loss_mlp": 1.01949465, + "epoch": 0.7881921481392413, + "flos": 21653237053440.0, + "grad_norm": 1.7210816373088333, + "language_loss": 0.66440421, + "learning_rate": 4.5228175499679254e-07, + "loss": 0.6860112, + "num_input_tokens_seen": 141287315, + "step": 6555, + "time_per_iteration": 3.2407355308532715 + }, + { + "auxiliary_loss_clip": 0.01055194, + "auxiliary_loss_mlp": 0.01001301, + "balance_loss_clip": 1.01077652, + "balance_loss_mlp": 1.00024033, + "epoch": 0.7883123910298804, + "flos": 68565860058240.0, + "grad_norm": 0.8151351645725482, + "language_loss": 0.54515839, + "learning_rate": 4.5178850466152174e-07, + "loss": 0.5657233, + "num_input_tokens_seen": 141346145, + "step": 6556, + "time_per_iteration": 3.0649099349975586 + }, + { + "auxiliary_loss_clip": 0.0113277, + "auxiliary_loss_mlp": 0.01023415, + "balance_loss_clip": 1.04187489, + "balance_loss_mlp": 1.01644158, + "epoch": 0.7884326339205194, + "flos": 19318217627520.0, + "grad_norm": 1.7963096579371622, + "language_loss": 0.81914318, + "learning_rate": 4.512954891887031e-07, + "loss": 0.84070504, + "num_input_tokens_seen": 141364445, + "step": 6557, + "time_per_iteration": 2.4567461013793945 + }, + { + "auxiliary_loss_clip": 0.01136199, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.04742813, + "balance_loss_mlp": 1.02181304, + "epoch": 0.7885528768111585, + "flos": 17784903807360.0, + "grad_norm": 2.2733152396122227, + "language_loss": 0.83459806, + "learning_rate": 4.5080270865312806e-07, + "loss": 0.85625732, + "num_input_tokens_seen": 141381640, + "step": 6558, + "time_per_iteration": 2.43115496635437 + }, + { + "auxiliary_loss_clip": 0.01150972, + "auxiliary_loss_mlp": 0.01022017, + "balance_loss_clip": 1.04605997, + "balance_loss_mlp": 1.01529968, + "epoch": 0.7886731197017977, + "flos": 18807639753600.0, + "grad_norm": 2.0857913693067376, + "language_loss": 0.71319795, + "learning_rate": 4.5031016312954985e-07, + "loss": 0.73492777, + "num_input_tokens_seen": 141399955, + "step": 6559, + "time_per_iteration": 2.4247934818267822 + }, + { + "auxiliary_loss_clip": 0.01161901, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.04975438, + "balance_loss_mlp": 1.0168258, + "epoch": 0.7887933625924367, + "flos": 33365358126720.0, + "grad_norm": 4.1630067051630775, + "language_loss": 0.74909467, + "learning_rate": 4.498178526926886e-07, + "loss": 0.77095181, + "num_input_tokens_seen": 141420820, + "step": 6560, + "time_per_iteration": 2.553723096847534 + }, + { + "auxiliary_loss_clip": 0.0116526, + "auxiliary_loss_mlp": 0.01027075, + "balance_loss_clip": 1.04810011, + "balance_loss_mlp": 1.02041388, + "epoch": 0.7889136054830758, + "flos": 17019360218880.0, + "grad_norm": 2.758014043534276, + "language_loss": 0.72061914, + "learning_rate": 4.4932577741722635e-07, + "loss": 0.74254251, + "num_input_tokens_seen": 141439350, + "step": 6561, + "time_per_iteration": 2.380889654159546 + }, + { + "auxiliary_loss_clip": 0.01134743, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.04419732, + "balance_loss_mlp": 1.02115107, + "epoch": 0.7890338483737149, + "flos": 29424629018880.0, + "grad_norm": 1.9559362193656162, + "language_loss": 0.74352396, + "learning_rate": 4.4883393737780985e-07, + "loss": 0.76516044, + "num_input_tokens_seen": 141460300, + "step": 6562, + "time_per_iteration": 2.524661064147949 + }, + { + "auxiliary_loss_clip": 0.01146446, + "auxiliary_loss_mlp": 0.01024985, + "balance_loss_clip": 1.04456294, + "balance_loss_mlp": 1.01792789, + "epoch": 0.789154091264354, + "flos": 19971576063360.0, + "grad_norm": 1.922886281588387, + "language_loss": 0.7861793, + "learning_rate": 4.4834233264905254e-07, + "loss": 0.80789363, + "num_input_tokens_seen": 141477315, + "step": 6563, + "time_per_iteration": 2.416424512863159 + }, + { + "auxiliary_loss_clip": 0.01116282, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.04081166, + "balance_loss_mlp": 1.0210191, + "epoch": 0.789274334154993, + "flos": 14537825216640.0, + "grad_norm": 2.6226564838199424, + "language_loss": 0.7143079, + "learning_rate": 4.478509633055294e-07, + "loss": 0.73575538, + "num_input_tokens_seen": 141495025, + "step": 6564, + "time_per_iteration": 2.471700668334961 + }, + { + "auxiliary_loss_clip": 0.01140841, + "auxiliary_loss_mlp": 0.01028025, + "balance_loss_clip": 1.04515147, + "balance_loss_mlp": 1.0203836, + "epoch": 0.7893945770456322, + "flos": 21827403123840.0, + "grad_norm": 3.3680339608858048, + "language_loss": 0.79994029, + "learning_rate": 4.473598294217813e-07, + "loss": 0.82162899, + "num_input_tokens_seen": 141510450, + "step": 6565, + "time_per_iteration": 2.4578559398651123 + }, + { + "auxiliary_loss_clip": 0.0115129, + "auxiliary_loss_mlp": 0.01022398, + "balance_loss_clip": 1.04835749, + "balance_loss_mlp": 1.01551068, + "epoch": 0.7895148199362713, + "flos": 20740639184640.0, + "grad_norm": 2.0752877753709598, + "language_loss": 0.71948761, + "learning_rate": 4.468689310723124e-07, + "loss": 0.74122441, + "num_input_tokens_seen": 141528265, + "step": 6566, + "time_per_iteration": 2.4193062782287598 + }, + { + "auxiliary_loss_clip": 0.01127121, + "auxiliary_loss_mlp": 0.01024212, + "balance_loss_clip": 1.04337358, + "balance_loss_mlp": 1.01701736, + "epoch": 0.7896350628269103, + "flos": 16690669839360.0, + "grad_norm": 1.68254047823207, + "language_loss": 0.78595316, + "learning_rate": 4.463782683315913e-07, + "loss": 0.80746651, + "num_input_tokens_seen": 141547270, + "step": 6567, + "time_per_iteration": 2.4683709144592285 + }, + { + "auxiliary_loss_clip": 0.0116324, + "auxiliary_loss_mlp": 0.0102462, + "balance_loss_clip": 1.04682946, + "balance_loss_mlp": 1.01782513, + "epoch": 0.7897553057175495, + "flos": 22638374438400.0, + "grad_norm": 1.7791749198296285, + "language_loss": 0.73169696, + "learning_rate": 4.458878412740523e-07, + "loss": 0.75357556, + "num_input_tokens_seen": 141566050, + "step": 6568, + "time_per_iteration": 2.400585651397705 + }, + { + "auxiliary_loss_clip": 0.01147715, + "auxiliary_loss_mlp": 0.01023001, + "balance_loss_clip": 1.04556549, + "balance_loss_mlp": 1.0158428, + "epoch": 0.7898755486081885, + "flos": 14537573821440.0, + "grad_norm": 2.809623088500836, + "language_loss": 0.77538687, + "learning_rate": 4.453976499740919e-07, + "loss": 0.79709399, + "num_input_tokens_seen": 141583695, + "step": 6569, + "time_per_iteration": 2.3946003913879395 + }, + { + "auxiliary_loss_clip": 0.01148466, + "auxiliary_loss_mlp": 0.01021776, + "balance_loss_clip": 1.04703081, + "balance_loss_mlp": 1.01494503, + "epoch": 0.7899957914988276, + "flos": 17238487138560.0, + "grad_norm": 1.6439343101453636, + "language_loss": 0.77846432, + "learning_rate": 4.4490769450607215e-07, + "loss": 0.80016673, + "num_input_tokens_seen": 141601320, + "step": 6570, + "time_per_iteration": 2.4071273803710938 + }, + { + "auxiliary_loss_clip": 0.01120218, + "auxiliary_loss_mlp": 0.0102223, + "balance_loss_clip": 1.03963912, + "balance_loss_mlp": 1.01531291, + "epoch": 0.7901160343894668, + "flos": 41279351086080.0, + "grad_norm": 1.8300896281009391, + "language_loss": 0.72620595, + "learning_rate": 4.4441797494431845e-07, + "loss": 0.74763042, + "num_input_tokens_seen": 141623125, + "step": 6571, + "time_per_iteration": 2.662055253982544 + }, + { + "auxiliary_loss_clip": 0.01149012, + "auxiliary_loss_mlp": 0.01024046, + "balance_loss_clip": 1.04602754, + "balance_loss_mlp": 1.01655662, + "epoch": 0.7902362772801058, + "flos": 16837005847680.0, + "grad_norm": 1.9374060795423802, + "language_loss": 0.7782433, + "learning_rate": 4.439284913631207e-07, + "loss": 0.79997391, + "num_input_tokens_seen": 141640335, + "step": 6572, + "time_per_iteration": 2.397940158843994 + }, + { + "auxiliary_loss_clip": 0.01127686, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.04582214, + "balance_loss_mlp": 1.02207732, + "epoch": 0.7903565201707449, + "flos": 27125987091840.0, + "grad_norm": 2.702596699673583, + "language_loss": 0.83565795, + "learning_rate": 4.434392438367347e-07, + "loss": 0.8572278, + "num_input_tokens_seen": 141659760, + "step": 6573, + "time_per_iteration": 2.531533718109131 + }, + { + "auxiliary_loss_clip": 0.0115622, + "auxiliary_loss_mlp": 0.0102143, + "balance_loss_clip": 1.04528213, + "balance_loss_mlp": 1.01421762, + "epoch": 0.790476763061384, + "flos": 31025167142400.0, + "grad_norm": 2.966522421635397, + "language_loss": 0.74172604, + "learning_rate": 4.4295023243937677e-07, + "loss": 0.76350248, + "num_input_tokens_seen": 141679965, + "step": 6574, + "time_per_iteration": 2.5097243785858154 + }, + { + "auxiliary_loss_clip": 0.01157439, + "auxiliary_loss_mlp": 0.01027073, + "balance_loss_clip": 1.05093598, + "balance_loss_mlp": 1.01953292, + "epoch": 0.7905970059520231, + "flos": 22089084681600.0, + "grad_norm": 1.7204156251130005, + "language_loss": 0.80530035, + "learning_rate": 4.4246145724523123e-07, + "loss": 0.82714558, + "num_input_tokens_seen": 141697710, + "step": 6575, + "time_per_iteration": 2.4417030811309814 + }, + { + "auxiliary_loss_clip": 0.01122534, + "auxiliary_loss_mlp": 0.01023488, + "balance_loss_clip": 1.04502881, + "balance_loss_mlp": 1.01640725, + "epoch": 0.7907172488426621, + "flos": 20558141159040.0, + "grad_norm": 2.096361457670146, + "language_loss": 0.77651459, + "learning_rate": 4.41972918328444e-07, + "loss": 0.79797482, + "num_input_tokens_seen": 141715145, + "step": 6576, + "time_per_iteration": 3.3165934085845947 + }, + { + "auxiliary_loss_clip": 0.01149275, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.0459249, + "balance_loss_mlp": 1.02227449, + "epoch": 0.7908374917333013, + "flos": 30081542901120.0, + "grad_norm": 3.214441896016704, + "language_loss": 0.7731806, + "learning_rate": 4.4148461576312646e-07, + "loss": 0.79496992, + "num_input_tokens_seen": 141734810, + "step": 6577, + "time_per_iteration": 3.2404873371124268 + }, + { + "auxiliary_loss_clip": 0.01153834, + "auxiliary_loss_mlp": 0.01022512, + "balance_loss_clip": 1.0492245, + "balance_loss_mlp": 1.01559448, + "epoch": 0.7909577346239404, + "flos": 20996359084800.0, + "grad_norm": 1.3785762635557686, + "language_loss": 0.74699754, + "learning_rate": 4.4099654962335343e-07, + "loss": 0.76876098, + "num_input_tokens_seen": 141755260, + "step": 6578, + "time_per_iteration": 3.3011975288391113 + }, + { + "auxiliary_loss_clip": 0.01145444, + "auxiliary_loss_mlp": 0.0102379, + "balance_loss_clip": 1.04795718, + "balance_loss_mlp": 1.01679218, + "epoch": 0.7910779775145794, + "flos": 26247935128320.0, + "grad_norm": 1.8128132770653769, + "language_loss": 0.75138372, + "learning_rate": 4.405087199831636e-07, + "loss": 0.77307606, + "num_input_tokens_seen": 141775500, + "step": 6579, + "time_per_iteration": 2.5055079460144043 + }, + { + "auxiliary_loss_clip": 0.01140659, + "auxiliary_loss_mlp": 0.00761408, + "balance_loss_clip": 1.04465985, + "balance_loss_mlp": 1.00035143, + "epoch": 0.7911982204052186, + "flos": 22564434291840.0, + "grad_norm": 2.102353587808442, + "language_loss": 0.67287648, + "learning_rate": 4.400211269165619e-07, + "loss": 0.69189715, + "num_input_tokens_seen": 141791955, + "step": 6580, + "time_per_iteration": 2.4734010696411133 + }, + { + "auxiliary_loss_clip": 0.01171218, + "auxiliary_loss_mlp": 0.01023561, + "balance_loss_clip": 1.05281544, + "balance_loss_mlp": 1.01724887, + "epoch": 0.7913184632958576, + "flos": 23112538899840.0, + "grad_norm": 1.4725045682308275, + "language_loss": 0.7670927, + "learning_rate": 4.3953377049751416e-07, + "loss": 0.78904045, + "num_input_tokens_seen": 141812380, + "step": 6581, + "time_per_iteration": 3.1848576068878174 + }, + { + "auxiliary_loss_clip": 0.01143829, + "auxiliary_loss_mlp": 0.01025948, + "balance_loss_clip": 1.04724455, + "balance_loss_mlp": 1.01895678, + "epoch": 0.7914387061864967, + "flos": 12311758719360.0, + "grad_norm": 2.777868849081666, + "language_loss": 0.77703995, + "learning_rate": 4.390466507999537e-07, + "loss": 0.79873765, + "num_input_tokens_seen": 141828130, + "step": 6582, + "time_per_iteration": 2.41615629196167 + }, + { + "auxiliary_loss_clip": 0.01120402, + "auxiliary_loss_mlp": 0.01028229, + "balance_loss_clip": 1.04160357, + "balance_loss_mlp": 1.0213058, + "epoch": 0.7915589490771359, + "flos": 17603267708160.0, + "grad_norm": 2.2040057104361357, + "language_loss": 0.75313091, + "learning_rate": 4.385597678977748e-07, + "loss": 0.7746172, + "num_input_tokens_seen": 141846965, + "step": 6583, + "time_per_iteration": 2.477984666824341 + }, + { + "auxiliary_loss_clip": 0.01138378, + "auxiliary_loss_mlp": 0.01021882, + "balance_loss_clip": 1.04486108, + "balance_loss_mlp": 1.01464927, + "epoch": 0.7916791919677749, + "flos": 25591272641280.0, + "grad_norm": 1.8946348671589661, + "language_loss": 0.75569862, + "learning_rate": 4.3807312186483726e-07, + "loss": 0.77730131, + "num_input_tokens_seen": 141867685, + "step": 6584, + "time_per_iteration": 2.496858596801758 + }, + { + "auxiliary_loss_clip": 0.0114917, + "auxiliary_loss_mlp": 0.01023636, + "balance_loss_clip": 1.04819489, + "balance_loss_mlp": 1.01640296, + "epoch": 0.791799434858414, + "flos": 18844340474880.0, + "grad_norm": 2.648147981521643, + "language_loss": 0.78432012, + "learning_rate": 4.375867127749655e-07, + "loss": 0.80604827, + "num_input_tokens_seen": 141885960, + "step": 6585, + "time_per_iteration": 2.4113566875457764 + }, + { + "auxiliary_loss_clip": 0.01128374, + "auxiliary_loss_mlp": 0.01025166, + "balance_loss_clip": 1.04844689, + "balance_loss_mlp": 1.01781094, + "epoch": 0.7919196777490531, + "flos": 25812015672960.0, + "grad_norm": 2.411423776021973, + "language_loss": 0.6719209, + "learning_rate": 4.3710054070194744e-07, + "loss": 0.69345629, + "num_input_tokens_seen": 141905655, + "step": 6586, + "time_per_iteration": 2.531641721725464 + }, + { + "auxiliary_loss_clip": 0.01168053, + "auxiliary_loss_mlp": 0.00761989, + "balance_loss_clip": 1.04852152, + "balance_loss_mlp": 1.0004313, + "epoch": 0.7920399206396922, + "flos": 11947624594560.0, + "grad_norm": 2.8417938874174347, + "language_loss": 0.66800284, + "learning_rate": 4.3661460571953455e-07, + "loss": 0.68730319, + "num_input_tokens_seen": 141922390, + "step": 6587, + "time_per_iteration": 2.3614377975463867 + }, + { + "auxiliary_loss_clip": 0.011509, + "auxiliary_loss_mlp": 0.01020981, + "balance_loss_clip": 1.04291511, + "balance_loss_mlp": 1.01420724, + "epoch": 0.7921601635303313, + "flos": 21579907438080.0, + "grad_norm": 1.5125766647095769, + "language_loss": 0.68470734, + "learning_rate": 4.36128907901443e-07, + "loss": 0.70642608, + "num_input_tokens_seen": 141941985, + "step": 6588, + "time_per_iteration": 2.4317331314086914 + }, + { + "auxiliary_loss_clip": 0.0112554, + "auxiliary_loss_mlp": 0.0102264, + "balance_loss_clip": 1.04207182, + "balance_loss_mlp": 1.01519573, + "epoch": 0.7922804064209703, + "flos": 18113989236480.0, + "grad_norm": 3.0670378583880904, + "language_loss": 0.72801197, + "learning_rate": 4.356434473213519e-07, + "loss": 0.74949372, + "num_input_tokens_seen": 141959435, + "step": 6589, + "time_per_iteration": 2.463629961013794 + }, + { + "auxiliary_loss_clip": 0.01135478, + "auxiliary_loss_mlp": 0.01023051, + "balance_loss_clip": 1.04511428, + "balance_loss_mlp": 1.01650667, + "epoch": 0.7924006493116095, + "flos": 21652806090240.0, + "grad_norm": 1.6623448842462079, + "language_loss": 0.79864168, + "learning_rate": 4.351582240529068e-07, + "loss": 0.82022691, + "num_input_tokens_seen": 141980265, + "step": 6590, + "time_per_iteration": 2.4719135761260986 + }, + { + "auxiliary_loss_clip": 0.01048226, + "auxiliary_loss_mlp": 0.0100223, + "balance_loss_clip": 1.01133204, + "balance_loss_mlp": 1.00105023, + "epoch": 0.7925208922022485, + "flos": 64242755694720.0, + "grad_norm": 0.8190078717215722, + "language_loss": 0.58210039, + "learning_rate": 4.346732381697149e-07, + "loss": 0.60260493, + "num_input_tokens_seen": 142044395, + "step": 6591, + "time_per_iteration": 3.0929298400878906 + }, + { + "auxiliary_loss_clip": 0.01131311, + "auxiliary_loss_mlp": 0.01025845, + "balance_loss_clip": 1.0438385, + "balance_loss_mlp": 1.01848435, + "epoch": 0.7926411350928876, + "flos": 16941541403520.0, + "grad_norm": 2.0617446319843036, + "language_loss": 0.81260121, + "learning_rate": 4.3418848974534825e-07, + "loss": 0.83417273, + "num_input_tokens_seen": 142061335, + "step": 6592, + "time_per_iteration": 2.4242377281188965 + }, + { + "auxiliary_loss_clip": 0.01129228, + "auxiliary_loss_mlp": 0.01023377, + "balance_loss_clip": 1.04533362, + "balance_loss_mlp": 1.01690149, + "epoch": 0.7927613779835267, + "flos": 34459987144320.0, + "grad_norm": 1.7948372383114815, + "language_loss": 0.68892765, + "learning_rate": 4.3370397885334276e-07, + "loss": 0.71045375, + "num_input_tokens_seen": 142081965, + "step": 6593, + "time_per_iteration": 2.5983049869537354 + }, + { + "auxiliary_loss_clip": 0.01147252, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.04742765, + "balance_loss_mlp": 1.02138102, + "epoch": 0.7928816208741658, + "flos": 18951174501120.0, + "grad_norm": 1.8713264344573266, + "language_loss": 0.75638378, + "learning_rate": 4.3321970556719777e-07, + "loss": 0.77814162, + "num_input_tokens_seen": 142100260, + "step": 6594, + "time_per_iteration": 2.4202566146850586 + }, + { + "auxiliary_loss_clip": 0.01166175, + "auxiliary_loss_mlp": 0.01027012, + "balance_loss_clip": 1.04855394, + "balance_loss_mlp": 1.01957309, + "epoch": 0.7930018637648049, + "flos": 18623022825600.0, + "grad_norm": 2.6766589335515127, + "language_loss": 0.718279, + "learning_rate": 4.3273566996037856e-07, + "loss": 0.74021089, + "num_input_tokens_seen": 142116955, + "step": 6595, + "time_per_iteration": 2.3635802268981934 + }, + { + "auxiliary_loss_clip": 0.01137154, + "auxiliary_loss_mlp": 0.01025349, + "balance_loss_clip": 1.04545259, + "balance_loss_mlp": 1.01849699, + "epoch": 0.793122106655444, + "flos": 24530650824960.0, + "grad_norm": 1.9060626493223316, + "language_loss": 0.80084467, + "learning_rate": 4.322518721063113e-07, + "loss": 0.82246965, + "num_input_tokens_seen": 142135505, + "step": 6596, + "time_per_iteration": 2.4905996322631836 + }, + { + "auxiliary_loss_clip": 0.01152518, + "auxiliary_loss_mlp": 0.01026542, + "balance_loss_clip": 1.04818416, + "balance_loss_mlp": 1.01936543, + "epoch": 0.7932423495460831, + "flos": 34421203434240.0, + "grad_norm": 2.032777346926402, + "language_loss": 0.70500243, + "learning_rate": 4.3176831207838906e-07, + "loss": 0.72679299, + "num_input_tokens_seen": 142158915, + "step": 6597, + "time_per_iteration": 2.5546207427978516 + }, + { + "auxiliary_loss_clip": 0.01151181, + "auxiliary_loss_mlp": 0.01022238, + "balance_loss_clip": 1.04924536, + "balance_loss_mlp": 1.01523197, + "epoch": 0.7933625924367221, + "flos": 26980333441920.0, + "grad_norm": 2.064778182947546, + "language_loss": 0.74736714, + "learning_rate": 4.3128498994996685e-07, + "loss": 0.76910138, + "num_input_tokens_seen": 142178390, + "step": 6598, + "time_per_iteration": 2.48626971244812 + }, + { + "auxiliary_loss_clip": 0.0115717, + "auxiliary_loss_mlp": 0.01026676, + "balance_loss_clip": 1.04825103, + "balance_loss_mlp": 1.01898718, + "epoch": 0.7934828353273613, + "flos": 29568630643200.0, + "grad_norm": 2.188865164993516, + "language_loss": 0.71513712, + "learning_rate": 4.308019057943646e-07, + "loss": 0.73697561, + "num_input_tokens_seen": 142200115, + "step": 6599, + "time_per_iteration": 2.5224437713623047 + }, + { + "auxiliary_loss_clip": 0.0111875, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.0437125, + "balance_loss_mlp": 1.02027774, + "epoch": 0.7936030782180004, + "flos": 28615381557120.0, + "grad_norm": 1.566567151418977, + "language_loss": 0.74606913, + "learning_rate": 4.3031905968486535e-07, + "loss": 0.76752687, + "num_input_tokens_seen": 142220945, + "step": 6600, + "time_per_iteration": 2.617225408554077 + }, + { + "auxiliary_loss_clip": 0.01106504, + "auxiliary_loss_mlp": 0.01022823, + "balance_loss_clip": 1.04422569, + "balance_loss_mlp": 1.01592636, + "epoch": 0.7937233211086394, + "flos": 16392574869120.0, + "grad_norm": 2.4639726675608062, + "language_loss": 0.68404102, + "learning_rate": 4.298364516947162e-07, + "loss": 0.70533431, + "num_input_tokens_seen": 142238175, + "step": 6601, + "time_per_iteration": 2.509373903274536 + }, + { + "auxiliary_loss_clip": 0.01105193, + "auxiliary_loss_mlp": 0.01021577, + "balance_loss_clip": 1.0403955, + "balance_loss_mlp": 1.01476693, + "epoch": 0.7938435639992786, + "flos": 22013420682240.0, + "grad_norm": 1.8733447522045494, + "language_loss": 0.65741765, + "learning_rate": 4.293540818971295e-07, + "loss": 0.67868537, + "num_input_tokens_seen": 142255980, + "step": 6602, + "time_per_iteration": 3.3912975788116455 + }, + { + "auxiliary_loss_clip": 0.01156994, + "auxiliary_loss_mlp": 0.01018569, + "balance_loss_clip": 1.04711223, + "balance_loss_mlp": 1.01138937, + "epoch": 0.7939638068899176, + "flos": 22197032029440.0, + "grad_norm": 2.044910526542651, + "language_loss": 0.76782644, + "learning_rate": 4.2887195036527934e-07, + "loss": 0.78958201, + "num_input_tokens_seen": 142274785, + "step": 6603, + "time_per_iteration": 3.1848556995391846 + }, + { + "auxiliary_loss_clip": 0.01141112, + "auxiliary_loss_mlp": 0.01022265, + "balance_loss_clip": 1.04158592, + "balance_loss_mlp": 1.01513028, + "epoch": 0.7940840497805567, + "flos": 17745186343680.0, + "grad_norm": 3.092064818015703, + "language_loss": 0.73251331, + "learning_rate": 4.28390057172306e-07, + "loss": 0.75414705, + "num_input_tokens_seen": 142291290, + "step": 6604, + "time_per_iteration": 2.3862674236297607 + }, + { + "auxiliary_loss_clip": 0.01118427, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.04036212, + "balance_loss_mlp": 1.01908207, + "epoch": 0.7942042926711959, + "flos": 23805435231360.0, + "grad_norm": 2.348865616828976, + "language_loss": 0.72159493, + "learning_rate": 4.279084023913111e-07, + "loss": 0.74304724, + "num_input_tokens_seen": 142309165, + "step": 6605, + "time_per_iteration": 3.351757049560547 + }, + { + "auxiliary_loss_clip": 0.01151842, + "auxiliary_loss_mlp": 0.01023228, + "balance_loss_clip": 1.04752648, + "balance_loss_mlp": 1.01621902, + "epoch": 0.7943245355618349, + "flos": 19244959839360.0, + "grad_norm": 1.8509945064574276, + "language_loss": 0.69608033, + "learning_rate": 4.2742698609536096e-07, + "loss": 0.71783102, + "num_input_tokens_seen": 142327475, + "step": 6606, + "time_per_iteration": 2.411729574203491 + }, + { + "auxiliary_loss_clip": 0.01141759, + "auxiliary_loss_mlp": 0.01026158, + "balance_loss_clip": 1.04604197, + "balance_loss_mlp": 1.01916051, + "epoch": 0.794444778452474, + "flos": 25007616547200.0, + "grad_norm": 1.8525007965634, + "language_loss": 0.78784835, + "learning_rate": 4.2694580835748706e-07, + "loss": 0.80952752, + "num_input_tokens_seen": 142347335, + "step": 6607, + "time_per_iteration": 2.507347583770752 + }, + { + "auxiliary_loss_clip": 0.01137214, + "auxiliary_loss_mlp": 0.01024264, + "balance_loss_clip": 1.04394722, + "balance_loss_mlp": 1.01677167, + "epoch": 0.7945650213431131, + "flos": 23221491828480.0, + "grad_norm": 2.2480442197891364, + "language_loss": 0.74014509, + "learning_rate": 4.264648692506836e-07, + "loss": 0.76175988, + "num_input_tokens_seen": 142366125, + "step": 6608, + "time_per_iteration": 3.192314624786377 + }, + { + "auxiliary_loss_clip": 0.01130992, + "auxiliary_loss_mlp": 0.01025439, + "balance_loss_clip": 1.04236984, + "balance_loss_mlp": 1.01782703, + "epoch": 0.7946852642337522, + "flos": 26062887237120.0, + "grad_norm": 1.8070844801424428, + "language_loss": 0.72065431, + "learning_rate": 4.2598416884790824e-07, + "loss": 0.74221861, + "num_input_tokens_seen": 142385175, + "step": 6609, + "time_per_iteration": 2.48479962348938 + }, + { + "auxiliary_loss_clip": 0.0114658, + "auxiliary_loss_mlp": 0.01022373, + "balance_loss_clip": 1.04435027, + "balance_loss_mlp": 1.0144453, + "epoch": 0.7948055071243912, + "flos": 23769704177280.0, + "grad_norm": 1.8588221875126303, + "language_loss": 0.81020772, + "learning_rate": 4.255037072220828e-07, + "loss": 0.83189726, + "num_input_tokens_seen": 142406545, + "step": 6610, + "time_per_iteration": 2.4791934490203857 + }, + { + "auxiliary_loss_clip": 0.01163863, + "auxiliary_loss_mlp": 0.01021872, + "balance_loss_clip": 1.047575, + "balance_loss_mlp": 1.01540196, + "epoch": 0.7949257500150304, + "flos": 21980814111360.0, + "grad_norm": 1.6322089106261355, + "language_loss": 0.72071111, + "learning_rate": 4.2502348444609293e-07, + "loss": 0.74256849, + "num_input_tokens_seen": 142426165, + "step": 6611, + "time_per_iteration": 2.393824338912964 + }, + { + "auxiliary_loss_clip": 0.01107092, + "auxiliary_loss_mlp": 0.01024829, + "balance_loss_clip": 1.03946376, + "balance_loss_mlp": 1.01811194, + "epoch": 0.7950459929056695, + "flos": 25774129802880.0, + "grad_norm": 2.0735636942778717, + "language_loss": 0.69263506, + "learning_rate": 4.2454350059278844e-07, + "loss": 0.71395433, + "num_input_tokens_seen": 142447225, + "step": 6612, + "time_per_iteration": 2.576458692550659 + }, + { + "auxiliary_loss_clip": 0.0113145, + "auxiliary_loss_mlp": 0.01022896, + "balance_loss_clip": 1.04071593, + "balance_loss_mlp": 1.01608574, + "epoch": 0.7951662357963085, + "flos": 22158068751360.0, + "grad_norm": 1.8668637044677727, + "language_loss": 0.84584939, + "learning_rate": 4.240637557349824e-07, + "loss": 0.8673929, + "num_input_tokens_seen": 142464440, + "step": 6613, + "time_per_iteration": 2.454233169555664 + }, + { + "auxiliary_loss_clip": 0.01125005, + "auxiliary_loss_mlp": 0.01024335, + "balance_loss_clip": 1.04366422, + "balance_loss_mlp": 1.01713729, + "epoch": 0.7952864786869477, + "flos": 24641938137600.0, + "grad_norm": 1.9281627515450201, + "language_loss": 0.66895097, + "learning_rate": 4.235842499454516e-07, + "loss": 0.69044435, + "num_input_tokens_seen": 142484355, + "step": 6614, + "time_per_iteration": 2.4768826961517334 + }, + { + "auxiliary_loss_clip": 0.01139714, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.04618716, + "balance_loss_mlp": 1.02038097, + "epoch": 0.7954067215775867, + "flos": 21830922656640.0, + "grad_norm": 2.052894538503174, + "language_loss": 0.83102804, + "learning_rate": 4.2310498329693687e-07, + "loss": 0.85269976, + "num_input_tokens_seen": 142505255, + "step": 6615, + "time_per_iteration": 2.5034849643707275 + }, + { + "auxiliary_loss_clip": 0.01155481, + "auxiliary_loss_mlp": 0.01024506, + "balance_loss_clip": 1.04739833, + "balance_loss_mlp": 1.01675487, + "epoch": 0.7955269644682258, + "flos": 24060652341120.0, + "grad_norm": 1.8714757143023533, + "language_loss": 0.80673945, + "learning_rate": 4.2262595586214164e-07, + "loss": 0.82853931, + "num_input_tokens_seen": 142526350, + "step": 6616, + "time_per_iteration": 2.4590518474578857 + }, + { + "auxiliary_loss_clip": 0.0115553, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.04738343, + "balance_loss_mlp": 1.01852143, + "epoch": 0.795647207358865, + "flos": 25010741030400.0, + "grad_norm": 1.8369499267244507, + "language_loss": 0.76683843, + "learning_rate": 4.221471677137358e-07, + "loss": 0.78865427, + "num_input_tokens_seen": 142547165, + "step": 6617, + "time_per_iteration": 2.4743330478668213 + }, + { + "auxiliary_loss_clip": 0.01126113, + "auxiliary_loss_mlp": 0.01022762, + "balance_loss_clip": 1.04303741, + "balance_loss_mlp": 1.01626253, + "epoch": 0.795767450249504, + "flos": 14648358343680.0, + "grad_norm": 1.8995220252151008, + "language_loss": 0.70113754, + "learning_rate": 4.216686189243492e-07, + "loss": 0.72262627, + "num_input_tokens_seen": 142565955, + "step": 6618, + "time_per_iteration": 2.4523582458496094 + }, + { + "auxiliary_loss_clip": 0.01122303, + "auxiliary_loss_mlp": 0.01021656, + "balance_loss_clip": 1.04486299, + "balance_loss_mlp": 1.01412189, + "epoch": 0.7958876931401431, + "flos": 18547897530240.0, + "grad_norm": 1.8052228019774563, + "language_loss": 0.72656584, + "learning_rate": 4.211903095665785e-07, + "loss": 0.74800545, + "num_input_tokens_seen": 142585340, + "step": 6619, + "time_per_iteration": 2.480369806289673 + }, + { + "auxiliary_loss_clip": 0.01145166, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.04394758, + "balance_loss_mlp": 1.02140558, + "epoch": 0.7960079360307821, + "flos": 21543960902400.0, + "grad_norm": 1.917538964622712, + "language_loss": 0.75246668, + "learning_rate": 4.2071223971298277e-07, + "loss": 0.77420354, + "num_input_tokens_seen": 142602525, + "step": 6620, + "time_per_iteration": 2.462979555130005 + }, + { + "auxiliary_loss_clip": 0.01152625, + "auxiliary_loss_mlp": 0.01023628, + "balance_loss_clip": 1.04575241, + "balance_loss_mlp": 1.01584041, + "epoch": 0.7961281789214213, + "flos": 25481745095040.0, + "grad_norm": 2.0093183304856908, + "language_loss": 0.61201036, + "learning_rate": 4.2023440943608433e-07, + "loss": 0.63377297, + "num_input_tokens_seen": 142622490, + "step": 6621, + "time_per_iteration": 2.4907608032226562 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.0102103, + "balance_loss_clip": 1.04299212, + "balance_loss_mlp": 1.0142591, + "epoch": 0.7962484218120603, + "flos": 21944436612480.0, + "grad_norm": 1.471355302036172, + "language_loss": 0.77995121, + "learning_rate": 4.1975681880837023e-07, + "loss": 0.80164349, + "num_input_tokens_seen": 142642495, + "step": 6622, + "time_per_iteration": 2.4658966064453125 + }, + { + "auxiliary_loss_clip": 0.0111791, + "auxiliary_loss_mlp": 0.01024361, + "balance_loss_clip": 1.03971243, + "balance_loss_mlp": 1.01717281, + "epoch": 0.7963686647026994, + "flos": 18876264687360.0, + "grad_norm": 1.7483037580313359, + "language_loss": 0.82218623, + "learning_rate": 4.192794679022895e-07, + "loss": 0.84360898, + "num_input_tokens_seen": 142660820, + "step": 6623, + "time_per_iteration": 2.4928228855133057 + }, + { + "auxiliary_loss_clip": 0.01152496, + "auxiliary_loss_mlp": 0.01026655, + "balance_loss_clip": 1.04489589, + "balance_loss_mlp": 1.02009594, + "epoch": 0.7964889075933386, + "flos": 29716582763520.0, + "grad_norm": 1.7625971478868678, + "language_loss": 0.7220723, + "learning_rate": 4.1880235679025743e-07, + "loss": 0.74386382, + "num_input_tokens_seen": 142680915, + "step": 6624, + "time_per_iteration": 2.5325610637664795 + }, + { + "auxiliary_loss_clip": 0.01095573, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.03922272, + "balance_loss_mlp": 1.02478313, + "epoch": 0.7966091504839776, + "flos": 29491458272640.0, + "grad_norm": 1.9780163457741171, + "language_loss": 0.63591146, + "learning_rate": 4.1832548554464986e-07, + "loss": 0.65718377, + "num_input_tokens_seen": 142699210, + "step": 6625, + "time_per_iteration": 2.616722583770752 + }, + { + "auxiliary_loss_clip": 0.01048986, + "auxiliary_loss_mlp": 0.01002833, + "balance_loss_clip": 1.0090121, + "balance_loss_mlp": 1.00172412, + "epoch": 0.7967293933746167, + "flos": 67288697101440.0, + "grad_norm": 0.7447212932907569, + "language_loss": 0.58800209, + "learning_rate": 4.178488542378098e-07, + "loss": 0.60852027, + "num_input_tokens_seen": 142756790, + "step": 6626, + "time_per_iteration": 2.9727652072906494 + }, + { + "auxiliary_loss_clip": 0.01169683, + "auxiliary_loss_mlp": 0.01025511, + "balance_loss_clip": 1.04920316, + "balance_loss_mlp": 1.01832235, + "epoch": 0.7968496362652558, + "flos": 25554679660800.0, + "grad_norm": 1.6894785489034347, + "language_loss": 0.89079356, + "learning_rate": 4.173724629420401e-07, + "loss": 0.91274548, + "num_input_tokens_seen": 142778150, + "step": 6627, + "time_per_iteration": 2.4321272373199463 + }, + { + "auxiliary_loss_clip": 0.01145601, + "auxiliary_loss_mlp": 0.01026488, + "balance_loss_clip": 1.04684985, + "balance_loss_mlp": 1.01886463, + "epoch": 0.7969698791558949, + "flos": 14501088581760.0, + "grad_norm": 2.6422234536385565, + "language_loss": 0.68216497, + "learning_rate": 4.168963117296087e-07, + "loss": 0.70388585, + "num_input_tokens_seen": 142795485, + "step": 6628, + "time_per_iteration": 2.4320638179779053 + }, + { + "auxiliary_loss_clip": 0.01166011, + "auxiliary_loss_mlp": 0.01021662, + "balance_loss_clip": 1.04813099, + "balance_loss_mlp": 1.01493895, + "epoch": 0.797090122046534, + "flos": 22127545169280.0, + "grad_norm": 2.09721354716106, + "language_loss": 0.76210493, + "learning_rate": 4.1642040067274876e-07, + "loss": 0.78398168, + "num_input_tokens_seen": 142815155, + "step": 6629, + "time_per_iteration": 3.2515742778778076 + }, + { + "auxiliary_loss_clip": 0.0114023, + "auxiliary_loss_mlp": 0.01019083, + "balance_loss_clip": 1.04439282, + "balance_loss_mlp": 1.01215971, + "epoch": 0.7972103649371731, + "flos": 19897671830400.0, + "grad_norm": 1.584353341051865, + "language_loss": 0.72362602, + "learning_rate": 4.1594472984365493e-07, + "loss": 0.74521911, + "num_input_tokens_seen": 142833840, + "step": 6630, + "time_per_iteration": 3.233954906463623 + }, + { + "auxiliary_loss_clip": 0.01148549, + "auxiliary_loss_mlp": 0.01027875, + "balance_loss_clip": 1.04648972, + "balance_loss_mlp": 1.02128029, + "epoch": 0.7973306078278122, + "flos": 36058621847040.0, + "grad_norm": 1.9469795785085844, + "language_loss": 0.77993846, + "learning_rate": 4.154692993144862e-07, + "loss": 0.80170268, + "num_input_tokens_seen": 142853610, + "step": 6631, + "time_per_iteration": 3.4039735794067383 + }, + { + "auxiliary_loss_clip": 0.01164139, + "auxiliary_loss_mlp": 0.00761675, + "balance_loss_clip": 1.04683375, + "balance_loss_mlp": 1.00043726, + "epoch": 0.7974508507184512, + "flos": 21360600950400.0, + "grad_norm": 2.6279990431078666, + "language_loss": 0.71698678, + "learning_rate": 4.1499410915736476e-07, + "loss": 0.73624492, + "num_input_tokens_seen": 142872540, + "step": 6632, + "time_per_iteration": 2.421588897705078 + }, + { + "auxiliary_loss_clip": 0.01058281, + "auxiliary_loss_mlp": 0.01000898, + "balance_loss_clip": 1.01310456, + "balance_loss_mlp": 0.99992037, + "epoch": 0.7975710936090904, + "flos": 68253115317120.0, + "grad_norm": 0.7752791990943658, + "language_loss": 0.6431545, + "learning_rate": 4.145191594443762e-07, + "loss": 0.6637463, + "num_input_tokens_seen": 142936895, + "step": 6633, + "time_per_iteration": 3.1609485149383545 + }, + { + "auxiliary_loss_clip": 0.01117424, + "auxiliary_loss_mlp": 0.01026095, + "balance_loss_clip": 1.04175329, + "balance_loss_mlp": 1.01909709, + "epoch": 0.7976913364997295, + "flos": 22492433479680.0, + "grad_norm": 1.6771579006444906, + "language_loss": 0.70526063, + "learning_rate": 4.140444502475713e-07, + "loss": 0.72669578, + "num_input_tokens_seen": 142956445, + "step": 6634, + "time_per_iteration": 2.549887180328369 + }, + { + "auxiliary_loss_clip": 0.01146464, + "auxiliary_loss_mlp": 0.01028909, + "balance_loss_clip": 1.04296589, + "balance_loss_mlp": 1.02203321, + "epoch": 0.7978115793903685, + "flos": 15263220378240.0, + "grad_norm": 1.7871226904047188, + "language_loss": 0.70029974, + "learning_rate": 4.1356998163896216e-07, + "loss": 0.72205341, + "num_input_tokens_seen": 142973495, + "step": 6635, + "time_per_iteration": 3.19233775138855 + }, + { + "auxiliary_loss_clip": 0.01127764, + "auxiliary_loss_mlp": 0.01023754, + "balance_loss_clip": 1.04546535, + "balance_loss_mlp": 1.01692009, + "epoch": 0.7979318222810077, + "flos": 19719232041600.0, + "grad_norm": 1.865828625478986, + "language_loss": 0.74809849, + "learning_rate": 4.130957536905255e-07, + "loss": 0.76961368, + "num_input_tokens_seen": 142991510, + "step": 6636, + "time_per_iteration": 2.5263454914093018 + }, + { + "auxiliary_loss_clip": 0.01146819, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.04677951, + "balance_loss_mlp": 1.0209589, + "epoch": 0.7980520651716467, + "flos": 15560273854080.0, + "grad_norm": 2.3695581695747987, + "language_loss": 0.71548969, + "learning_rate": 4.1262176647420134e-07, + "loss": 0.73724574, + "num_input_tokens_seen": 143009675, + "step": 6637, + "time_per_iteration": 2.4899373054504395 + }, + { + "auxiliary_loss_clip": 0.01145188, + "auxiliary_loss_mlp": 0.01025695, + "balance_loss_clip": 1.04762423, + "balance_loss_mlp": 1.0186677, + "epoch": 0.7981723080622858, + "flos": 22309432663680.0, + "grad_norm": 1.747200761284665, + "language_loss": 0.79858661, + "learning_rate": 4.121480200618923e-07, + "loss": 0.82029545, + "num_input_tokens_seen": 143029330, + "step": 6638, + "time_per_iteration": 2.630326986312866 + }, + { + "auxiliary_loss_clip": 0.01133709, + "auxiliary_loss_mlp": 0.01022972, + "balance_loss_clip": 1.044855, + "balance_loss_mlp": 1.01623678, + "epoch": 0.798292550952925, + "flos": 22929573997440.0, + "grad_norm": 1.673504942036836, + "language_loss": 0.80067152, + "learning_rate": 4.116745145254674e-07, + "loss": 0.82223833, + "num_input_tokens_seen": 143048865, + "step": 6639, + "time_per_iteration": 2.527531623840332 + }, + { + "auxiliary_loss_clip": 0.01039239, + "auxiliary_loss_mlp": 0.0100207, + "balance_loss_clip": 1.00960457, + "balance_loss_mlp": 1.00106227, + "epoch": 0.798412793843564, + "flos": 64497936890880.0, + "grad_norm": 0.7656802468105876, + "language_loss": 0.58011341, + "learning_rate": 4.1120124993675476e-07, + "loss": 0.60052645, + "num_input_tokens_seen": 143113295, + "step": 6640, + "time_per_iteration": 3.1508359909057617 + }, + { + "auxiliary_loss_clip": 0.01145018, + "auxiliary_loss_mlp": 0.01022817, + "balance_loss_clip": 1.04478443, + "balance_loss_mlp": 1.0151875, + "epoch": 0.7985330367342031, + "flos": 13586910514560.0, + "grad_norm": 2.1090898927963853, + "language_loss": 0.61771172, + "learning_rate": 4.107282263675498e-07, + "loss": 0.63939011, + "num_input_tokens_seen": 143130965, + "step": 6641, + "time_per_iteration": 2.501239538192749 + }, + { + "auxiliary_loss_clip": 0.01039931, + "auxiliary_loss_mlp": 0.00752985, + "balance_loss_clip": 1.01194334, + "balance_loss_mlp": 0.99999315, + "epoch": 0.7986532796248422, + "flos": 67698797656320.0, + "grad_norm": 0.7910024269912419, + "language_loss": 0.52526909, + "learning_rate": 4.1025544388960907e-07, + "loss": 0.54319823, + "num_input_tokens_seen": 143192005, + "step": 6642, + "time_per_iteration": 3.082073450088501 + }, + { + "auxiliary_loss_clip": 0.01151596, + "auxiliary_loss_mlp": 0.01026218, + "balance_loss_clip": 1.04854918, + "balance_loss_mlp": 1.01911342, + "epoch": 0.7987735225154813, + "flos": 22455373622400.0, + "grad_norm": 1.7941724437790638, + "language_loss": 0.7171185, + "learning_rate": 4.097829025746538e-07, + "loss": 0.73889667, + "num_input_tokens_seen": 143213550, + "step": 6643, + "time_per_iteration": 2.5058670043945312 + }, + { + "auxiliary_loss_clip": 0.01054208, + "auxiliary_loss_mlp": 0.01003208, + "balance_loss_clip": 1.01105118, + "balance_loss_mlp": 1.00221813, + "epoch": 0.7988937654061203, + "flos": 68864098682880.0, + "grad_norm": 0.657115815200203, + "language_loss": 0.60978001, + "learning_rate": 4.0931060249436757e-07, + "loss": 0.63035417, + "num_input_tokens_seen": 143277390, + "step": 6644, + "time_per_iteration": 3.07392954826355 + }, + { + "auxiliary_loss_clip": 0.0114965, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.04651618, + "balance_loss_mlp": 1.0229193, + "epoch": 0.7990140082967595, + "flos": 20806893820800.0, + "grad_norm": 1.9034701140918886, + "language_loss": 0.69732344, + "learning_rate": 4.088385437203978e-07, + "loss": 0.71912265, + "num_input_tokens_seen": 143294400, + "step": 6645, + "time_per_iteration": 2.4290459156036377 + }, + { + "auxiliary_loss_clip": 0.01165878, + "auxiliary_loss_mlp": 0.01023936, + "balance_loss_clip": 1.04676008, + "balance_loss_mlp": 1.01672649, + "epoch": 0.7991342511873986, + "flos": 18985289443200.0, + "grad_norm": 2.137935974741238, + "language_loss": 0.77576292, + "learning_rate": 4.083667263243564e-07, + "loss": 0.79766107, + "num_input_tokens_seen": 143312745, + "step": 6646, + "time_per_iteration": 2.4386520385742188 + }, + { + "auxiliary_loss_clip": 0.01150429, + "auxiliary_loss_mlp": 0.01025181, + "balance_loss_clip": 1.04839861, + "balance_loss_mlp": 1.018502, + "epoch": 0.7992544940780376, + "flos": 20816805974400.0, + "grad_norm": 1.9925427171261334, + "language_loss": 0.71968889, + "learning_rate": 4.0789515037781653e-07, + "loss": 0.741445, + "num_input_tokens_seen": 143333470, + "step": 6647, + "time_per_iteration": 2.4792494773864746 + }, + { + "auxiliary_loss_clip": 0.01155546, + "auxiliary_loss_mlp": 0.01022919, + "balance_loss_clip": 1.04755127, + "balance_loss_mlp": 1.01606393, + "epoch": 0.7993747369686768, + "flos": 12640772321280.0, + "grad_norm": 1.7584999545295306, + "language_loss": 0.82385117, + "learning_rate": 4.0742381595231755e-07, + "loss": 0.84563583, + "num_input_tokens_seen": 143350195, + "step": 6648, + "time_per_iteration": 2.477421522140503 + }, + { + "auxiliary_loss_clip": 0.01127987, + "auxiliary_loss_mlp": 0.01025056, + "balance_loss_clip": 1.04492748, + "balance_loss_mlp": 1.01849341, + "epoch": 0.7994949798593158, + "flos": 20078769225600.0, + "grad_norm": 2.2203364577199, + "language_loss": 0.78458756, + "learning_rate": 4.06952723119359e-07, + "loss": 0.80611795, + "num_input_tokens_seen": 143370070, + "step": 6649, + "time_per_iteration": 2.5374505519866943 + }, + { + "auxiliary_loss_clip": 0.01127601, + "auxiliary_loss_mlp": 0.01026033, + "balance_loss_clip": 1.04227853, + "balance_loss_mlp": 1.0186007, + "epoch": 0.7996152227499549, + "flos": 38654209509120.0, + "grad_norm": 1.8569947275147876, + "language_loss": 0.67374372, + "learning_rate": 4.0648187195040504e-07, + "loss": 0.69528008, + "num_input_tokens_seen": 143392275, + "step": 6650, + "time_per_iteration": 2.6422083377838135 + }, + { + "auxiliary_loss_clip": 0.01049539, + "auxiliary_loss_mlp": 0.01002018, + "balance_loss_clip": 1.00941777, + "balance_loss_mlp": 1.00083816, + "epoch": 0.799735465640594, + "flos": 70243821947520.0, + "grad_norm": 0.8104505219086647, + "language_loss": 0.6760093, + "learning_rate": 4.060112625168848e-07, + "loss": 0.69652486, + "num_input_tokens_seen": 143457385, + "step": 6651, + "time_per_iteration": 3.1255669593811035 + }, + { + "auxiliary_loss_clip": 0.01166879, + "auxiliary_loss_mlp": 0.01023976, + "balance_loss_clip": 1.04918313, + "balance_loss_mlp": 1.01689172, + "epoch": 0.7998557085312331, + "flos": 24240995550720.0, + "grad_norm": 1.8984052468269064, + "language_loss": 0.73776877, + "learning_rate": 4.055408948901886e-07, + "loss": 0.75967729, + "num_input_tokens_seen": 143478785, + "step": 6652, + "time_per_iteration": 2.4691245555877686 + }, + { + "auxiliary_loss_clip": 0.01155409, + "auxiliary_loss_mlp": 0.01023496, + "balance_loss_clip": 1.04716694, + "balance_loss_mlp": 1.01551473, + "epoch": 0.7999759514218722, + "flos": 27564025449600.0, + "grad_norm": 2.258278549137495, + "language_loss": 0.71474397, + "learning_rate": 4.050707691416708e-07, + "loss": 0.73653305, + "num_input_tokens_seen": 143500095, + "step": 6653, + "time_per_iteration": 2.503244161605835 + }, + { + "auxiliary_loss_clip": 0.01049821, + "auxiliary_loss_mlp": 0.01001798, + "balance_loss_clip": 1.0097363, + "balance_loss_mlp": 1.0006417, + "epoch": 0.8000961943125112, + "flos": 67337428878720.0, + "grad_norm": 0.6762803425296061, + "language_loss": 0.59772789, + "learning_rate": 4.046008853426495e-07, + "loss": 0.61824417, + "num_input_tokens_seen": 143563410, + "step": 6654, + "time_per_iteration": 3.1881251335144043 + }, + { + "auxiliary_loss_clip": 0.01118773, + "auxiliary_loss_mlp": 0.01025273, + "balance_loss_clip": 1.04262495, + "balance_loss_mlp": 1.017977, + "epoch": 0.8002164372031504, + "flos": 28733815676160.0, + "grad_norm": 4.545113583885504, + "language_loss": 0.62991476, + "learning_rate": 4.0413124356440464e-07, + "loss": 0.65135515, + "num_input_tokens_seen": 143587455, + "step": 6655, + "time_per_iteration": 3.439544200897217 + }, + { + "auxiliary_loss_clip": 0.01113118, + "auxiliary_loss_mlp": 0.01025344, + "balance_loss_clip": 1.04144979, + "balance_loss_mlp": 1.01833725, + "epoch": 0.8003366800937894, + "flos": 17639429725440.0, + "grad_norm": 1.8082924126450304, + "language_loss": 0.82063144, + "learning_rate": 4.0366184387818223e-07, + "loss": 0.8420161, + "num_input_tokens_seen": 143605915, + "step": 6656, + "time_per_iteration": 2.5539333820343018 + }, + { + "auxiliary_loss_clip": 0.01171186, + "auxiliary_loss_mlp": 0.01022304, + "balance_loss_clip": 1.04884708, + "balance_loss_mlp": 1.01445675, + "epoch": 0.8004569229844285, + "flos": 25995303797760.0, + "grad_norm": 1.7958345363539914, + "language_loss": 0.85095513, + "learning_rate": 4.0319268635518797e-07, + "loss": 0.87289011, + "num_input_tokens_seen": 143626490, + "step": 6657, + "time_per_iteration": 2.4990313053131104 + }, + { + "auxiliary_loss_clip": 0.01151284, + "auxiliary_loss_mlp": 0.01018873, + "balance_loss_clip": 1.0458653, + "balance_loss_mlp": 1.01235521, + "epoch": 0.8005771658750677, + "flos": 20812352688000.0, + "grad_norm": 1.628504546613032, + "language_loss": 0.75346142, + "learning_rate": 4.027237710665943e-07, + "loss": 0.77516305, + "num_input_tokens_seen": 143644955, + "step": 6658, + "time_per_iteration": 3.2879130840301514 + }, + { + "auxiliary_loss_clip": 0.01125352, + "auxiliary_loss_mlp": 0.01025662, + "balance_loss_clip": 1.04112196, + "balance_loss_mlp": 1.01829529, + "epoch": 0.8006974087657067, + "flos": 25812626204160.0, + "grad_norm": 2.697019971360361, + "language_loss": 0.69361913, + "learning_rate": 4.022550980835344e-07, + "loss": 0.71512932, + "num_input_tokens_seen": 143667200, + "step": 6659, + "time_per_iteration": 2.60933256149292 + }, + { + "auxiliary_loss_clip": 0.01121099, + "auxiliary_loss_mlp": 0.01022905, + "balance_loss_clip": 1.04010904, + "balance_loss_mlp": 1.0158838, + "epoch": 0.8008176516563458, + "flos": 17164690646400.0, + "grad_norm": 2.2842929689764855, + "language_loss": 0.79404354, + "learning_rate": 4.017866674771051e-07, + "loss": 0.81548357, + "num_input_tokens_seen": 143684685, + "step": 6660, + "time_per_iteration": 2.5000317096710205 + }, + { + "auxiliary_loss_clip": 0.01101012, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.03868544, + "balance_loss_mlp": 1.02040291, + "epoch": 0.8009378945469849, + "flos": 24207311571840.0, + "grad_norm": 1.8961428877580324, + "language_loss": 0.74707818, + "learning_rate": 4.013184793183688e-07, + "loss": 0.76836503, + "num_input_tokens_seen": 143706780, + "step": 6661, + "time_per_iteration": 3.3352625370025635 + }, + { + "auxiliary_loss_clip": 0.01149144, + "auxiliary_loss_mlp": 0.01025466, + "balance_loss_clip": 1.04364705, + "balance_loss_mlp": 1.01876688, + "epoch": 0.801058137437624, + "flos": 19787318271360.0, + "grad_norm": 1.8881118416660736, + "language_loss": 0.72744876, + "learning_rate": 4.008505336783472e-07, + "loss": 0.74919486, + "num_input_tokens_seen": 143724505, + "step": 6662, + "time_per_iteration": 2.44775128364563 + }, + { + "auxiliary_loss_clip": 0.01141981, + "auxiliary_loss_mlp": 0.01024167, + "balance_loss_clip": 1.04392195, + "balance_loss_mlp": 1.01752436, + "epoch": 0.801178380328263, + "flos": 18659400324480.0, + "grad_norm": 1.9128885411117909, + "language_loss": 0.80724239, + "learning_rate": 4.003828306280284e-07, + "loss": 0.82890391, + "num_input_tokens_seen": 143742180, + "step": 6663, + "time_per_iteration": 2.4275035858154297 + }, + { + "auxiliary_loss_clip": 0.01152745, + "auxiliary_loss_mlp": 0.01022696, + "balance_loss_clip": 1.0462929, + "balance_loss_mlp": 1.01620793, + "epoch": 0.8012986232189022, + "flos": 15706573948800.0, + "grad_norm": 1.8353789082887482, + "language_loss": 0.7816295, + "learning_rate": 3.999153702383626e-07, + "loss": 0.80338395, + "num_input_tokens_seen": 143760070, + "step": 6664, + "time_per_iteration": 2.4279041290283203 + }, + { + "auxiliary_loss_clip": 0.01154723, + "auxiliary_loss_mlp": 0.01023236, + "balance_loss_clip": 1.04591537, + "balance_loss_mlp": 1.0157851, + "epoch": 0.8014188661095413, + "flos": 28584139703040.0, + "grad_norm": 1.786727106013486, + "language_loss": 0.73600835, + "learning_rate": 3.9944815258026263e-07, + "loss": 0.757788, + "num_input_tokens_seen": 143781890, + "step": 6665, + "time_per_iteration": 2.5283732414245605 + }, + { + "auxiliary_loss_clip": 0.01155398, + "auxiliary_loss_mlp": 0.01025454, + "balance_loss_clip": 1.04737949, + "balance_loss_mlp": 1.01794958, + "epoch": 0.8015391090001803, + "flos": 29310360877440.0, + "grad_norm": 1.7974828799829023, + "language_loss": 0.83458436, + "learning_rate": 3.989811777246057e-07, + "loss": 0.85639292, + "num_input_tokens_seen": 143802060, + "step": 6666, + "time_per_iteration": 2.5059494972229004 + }, + { + "auxiliary_loss_clip": 0.01064879, + "auxiliary_loss_mlp": 0.01000984, + "balance_loss_clip": 1.0104183, + "balance_loss_mlp": 0.99997026, + "epoch": 0.8016593518908195, + "flos": 70397340675840.0, + "grad_norm": 0.8520184815143531, + "language_loss": 0.66284394, + "learning_rate": 3.985144457422305e-07, + "loss": 0.68350255, + "num_input_tokens_seen": 143856345, + "step": 6667, + "time_per_iteration": 2.964301586151123 + }, + { + "auxiliary_loss_clip": 0.01166185, + "auxiliary_loss_mlp": 0.01022195, + "balance_loss_clip": 1.04805779, + "balance_loss_mlp": 1.01520085, + "epoch": 0.8017795947814585, + "flos": 26026114688640.0, + "grad_norm": 2.0642245157263837, + "language_loss": 0.76509959, + "learning_rate": 3.9804795670394096e-07, + "loss": 0.78698337, + "num_input_tokens_seen": 143876470, + "step": 6668, + "time_per_iteration": 2.4591193199157715 + }, + { + "auxiliary_loss_clip": 0.0112999, + "auxiliary_loss_mlp": 0.01023939, + "balance_loss_clip": 1.04341531, + "balance_loss_mlp": 1.01714134, + "epoch": 0.8018998376720976, + "flos": 22087181260800.0, + "grad_norm": 1.5969724554204254, + "language_loss": 0.70731962, + "learning_rate": 3.975817106805022e-07, + "loss": 0.72885883, + "num_input_tokens_seen": 143895170, + "step": 6669, + "time_per_iteration": 2.4780917167663574 + }, + { + "auxiliary_loss_clip": 0.01124782, + "auxiliary_loss_mlp": 0.01027451, + "balance_loss_clip": 1.04396296, + "balance_loss_mlp": 1.0199343, + "epoch": 0.8020200805627368, + "flos": 34568545023360.0, + "grad_norm": 2.1789523549842444, + "language_loss": 0.64707994, + "learning_rate": 3.97115707742645e-07, + "loss": 0.66860229, + "num_input_tokens_seen": 143915845, + "step": 6670, + "time_per_iteration": 2.634737491607666 + }, + { + "auxiliary_loss_clip": 0.01140208, + "auxiliary_loss_mlp": 0.01021172, + "balance_loss_clip": 1.04683733, + "balance_loss_mlp": 1.01445496, + "epoch": 0.8021403234533758, + "flos": 20120354196480.0, + "grad_norm": 2.0240657156320894, + "language_loss": 0.65097427, + "learning_rate": 3.966499479610599e-07, + "loss": 0.67258811, + "num_input_tokens_seen": 143933940, + "step": 6671, + "time_per_iteration": 2.4889557361602783 + }, + { + "auxiliary_loss_clip": 0.01120821, + "auxiliary_loss_mlp": 0.01022906, + "balance_loss_clip": 1.04516745, + "balance_loss_mlp": 1.01626611, + "epoch": 0.8022605663440149, + "flos": 27746200252800.0, + "grad_norm": 1.772648516596436, + "language_loss": 0.65133452, + "learning_rate": 3.9618443140640225e-07, + "loss": 0.67277181, + "num_input_tokens_seen": 143952850, + "step": 6672, + "time_per_iteration": 2.549924850463867 + }, + { + "auxiliary_loss_clip": 0.0102025, + "auxiliary_loss_mlp": 0.0100202, + "balance_loss_clip": 1.00861204, + "balance_loss_mlp": 1.0009768, + "epoch": 0.802380809234654, + "flos": 60244998768000.0, + "grad_norm": 0.6882311653034767, + "language_loss": 0.51398444, + "learning_rate": 3.957191581492918e-07, + "loss": 0.53420717, + "num_input_tokens_seen": 144013610, + "step": 6673, + "time_per_iteration": 3.1349406242370605 + }, + { + "auxiliary_loss_clip": 0.0113414, + "auxiliary_loss_mlp": 0.01022172, + "balance_loss_clip": 1.04385209, + "balance_loss_mlp": 1.01503181, + "epoch": 0.8025010521252931, + "flos": 15080722352640.0, + "grad_norm": 2.6127279831027876, + "language_loss": 0.70470041, + "learning_rate": 3.952541282603097e-07, + "loss": 0.72626352, + "num_input_tokens_seen": 144028715, + "step": 6674, + "time_per_iteration": 2.43558669090271 + }, + { + "auxiliary_loss_clip": 0.01150407, + "auxiliary_loss_mlp": 0.01021929, + "balance_loss_clip": 1.04614091, + "balance_loss_mlp": 1.01501822, + "epoch": 0.8026212950159322, + "flos": 22163527618560.0, + "grad_norm": 2.0698465936192596, + "language_loss": 0.83650875, + "learning_rate": 3.9478934181000013e-07, + "loss": 0.85823214, + "num_input_tokens_seen": 144048740, + "step": 6675, + "time_per_iteration": 2.449049234390259 + }, + { + "auxiliary_loss_clip": 0.01169052, + "auxiliary_loss_mlp": 0.01022378, + "balance_loss_clip": 1.0476408, + "balance_loss_mlp": 1.01522565, + "epoch": 0.8027415379065713, + "flos": 17675986792320.0, + "grad_norm": 2.487618029224469, + "language_loss": 0.84529996, + "learning_rate": 3.943247988688714e-07, + "loss": 0.8672142, + "num_input_tokens_seen": 144067435, + "step": 6676, + "time_per_iteration": 2.3897159099578857 + }, + { + "auxiliary_loss_clip": 0.01152194, + "auxiliary_loss_mlp": 0.01022045, + "balance_loss_clip": 1.0463289, + "balance_loss_mlp": 1.01578331, + "epoch": 0.8028617807972104, + "flos": 21979593048960.0, + "grad_norm": 1.6999376240464537, + "language_loss": 0.72038507, + "learning_rate": 3.938604995073933e-07, + "loss": 0.74212742, + "num_input_tokens_seen": 144085905, + "step": 6677, + "time_per_iteration": 2.531378984451294 + }, + { + "auxiliary_loss_clip": 0.01140265, + "auxiliary_loss_mlp": 0.01024485, + "balance_loss_clip": 1.04425871, + "balance_loss_mlp": 1.0176903, + "epoch": 0.8029820236878494, + "flos": 26428457905920.0, + "grad_norm": 1.8971618848262934, + "language_loss": 0.65273595, + "learning_rate": 3.9339644379600157e-07, + "loss": 0.67438352, + "num_input_tokens_seen": 144105735, + "step": 6678, + "time_per_iteration": 2.527109384536743 + }, + { + "auxiliary_loss_clip": 0.01156656, + "auxiliary_loss_mlp": 0.01025693, + "balance_loss_clip": 1.04819679, + "balance_loss_mlp": 1.01890421, + "epoch": 0.8031022665784886, + "flos": 17676489582720.0, + "grad_norm": 1.7219975960449956, + "language_loss": 0.71130437, + "learning_rate": 3.929326318050907e-07, + "loss": 0.73312783, + "num_input_tokens_seen": 144123405, + "step": 6679, + "time_per_iteration": 2.4120123386383057 + }, + { + "auxiliary_loss_clip": 0.0116192, + "auxiliary_loss_mlp": 0.01024445, + "balance_loss_clip": 1.04530418, + "balance_loss_mlp": 1.01745081, + "epoch": 0.8032225094691277, + "flos": 15450279431040.0, + "grad_norm": 2.2371876749609902, + "language_loss": 0.78963286, + "learning_rate": 3.924690636050225e-07, + "loss": 0.8114965, + "num_input_tokens_seen": 144140815, + "step": 6680, + "time_per_iteration": 2.3645241260528564 + }, + { + "auxiliary_loss_clip": 0.01154145, + "auxiliary_loss_mlp": 0.01026626, + "balance_loss_clip": 1.04784143, + "balance_loss_mlp": 1.01846623, + "epoch": 0.8033427523597667, + "flos": 26179202453760.0, + "grad_norm": 2.23077868817721, + "language_loss": 0.73061395, + "learning_rate": 3.9200573926611915e-07, + "loss": 0.75242168, + "num_input_tokens_seen": 144162230, + "step": 6681, + "time_per_iteration": 2.476757526397705 + }, + { + "auxiliary_loss_clip": 0.01152857, + "auxiliary_loss_mlp": 0.01025499, + "balance_loss_clip": 1.04971278, + "balance_loss_mlp": 1.01810765, + "epoch": 0.8034629952504058, + "flos": 21324905809920.0, + "grad_norm": 1.9271129731830863, + "language_loss": 0.72951698, + "learning_rate": 3.9154265885866613e-07, + "loss": 0.75130057, + "num_input_tokens_seen": 144181540, + "step": 6682, + "time_per_iteration": 3.3160345554351807 + }, + { + "auxiliary_loss_clip": 0.01149921, + "auxiliary_loss_mlp": 0.01025662, + "balance_loss_clip": 1.04675198, + "balance_loss_mlp": 1.0178771, + "epoch": 0.8035832381410449, + "flos": 21651585027840.0, + "grad_norm": 2.7474751679527434, + "language_loss": 0.75135398, + "learning_rate": 3.9107982245291394e-07, + "loss": 0.77310973, + "num_input_tokens_seen": 144199665, + "step": 6683, + "time_per_iteration": 3.273348331451416 + }, + { + "auxiliary_loss_clip": 0.01124769, + "auxiliary_loss_mlp": 0.01025883, + "balance_loss_clip": 1.04568338, + "balance_loss_mlp": 1.01844466, + "epoch": 0.803703481031684, + "flos": 20518818744960.0, + "grad_norm": 2.376624634240618, + "language_loss": 0.77103812, + "learning_rate": 3.9061723011907245e-07, + "loss": 0.79254466, + "num_input_tokens_seen": 144219020, + "step": 6684, + "time_per_iteration": 2.5193569660186768 + }, + { + "auxiliary_loss_clip": 0.011371, + "auxiliary_loss_mlp": 0.01023423, + "balance_loss_clip": 1.04436541, + "balance_loss_mlp": 1.01645815, + "epoch": 0.803823723922323, + "flos": 22854807838080.0, + "grad_norm": 1.78926650257332, + "language_loss": 0.79217517, + "learning_rate": 3.901548819273179e-07, + "loss": 0.81378043, + "num_input_tokens_seen": 144239035, + "step": 6685, + "time_per_iteration": 3.3258824348449707 + }, + { + "auxiliary_loss_clip": 0.01153621, + "auxiliary_loss_mlp": 0.01025708, + "balance_loss_clip": 1.04795897, + "balance_loss_mlp": 1.01842475, + "epoch": 0.8039439668129622, + "flos": 21362145235200.0, + "grad_norm": 2.420503086853807, + "language_loss": 0.69310957, + "learning_rate": 3.896927779477881e-07, + "loss": 0.71490288, + "num_input_tokens_seen": 144258295, + "step": 6686, + "time_per_iteration": 2.447969436645508 + }, + { + "auxiliary_loss_clip": 0.01123779, + "auxiliary_loss_mlp": 0.01022723, + "balance_loss_clip": 1.04276121, + "balance_loss_mlp": 1.01528442, + "epoch": 0.8040642097036013, + "flos": 23802382575360.0, + "grad_norm": 2.0869171929170998, + "language_loss": 0.66904062, + "learning_rate": 3.892309182505833e-07, + "loss": 0.69050562, + "num_input_tokens_seen": 144276110, + "step": 6687, + "time_per_iteration": 2.5395350456237793 + }, + { + "auxiliary_loss_clip": 0.01163261, + "auxiliary_loss_mlp": 0.01024587, + "balance_loss_clip": 1.04517627, + "balance_loss_mlp": 1.01760769, + "epoch": 0.8041844525942403, + "flos": 25922046009600.0, + "grad_norm": 2.2044442396972013, + "language_loss": 0.86270356, + "learning_rate": 3.887693029057675e-07, + "loss": 0.8845821, + "num_input_tokens_seen": 144295620, + "step": 6688, + "time_per_iteration": 3.204310178756714 + }, + { + "auxiliary_loss_clip": 0.01136067, + "auxiliary_loss_mlp": 0.01023722, + "balance_loss_clip": 1.04409742, + "balance_loss_mlp": 1.01681995, + "epoch": 0.8043046954848795, + "flos": 25191120153600.0, + "grad_norm": 1.621480719087694, + "language_loss": 0.81293631, + "learning_rate": 3.8830793198336684e-07, + "loss": 0.83453417, + "num_input_tokens_seen": 144315210, + "step": 6689, + "time_per_iteration": 2.5239224433898926 + }, + { + "auxiliary_loss_clip": 0.0115652, + "auxiliary_loss_mlp": 0.0102652, + "balance_loss_clip": 1.04580259, + "balance_loss_mlp": 1.01953149, + "epoch": 0.8044249383755185, + "flos": 41719185123840.0, + "grad_norm": 1.6535277724128856, + "language_loss": 0.70343477, + "learning_rate": 3.878468055533721e-07, + "loss": 0.7252652, + "num_input_tokens_seen": 144337750, + "step": 6690, + "time_per_iteration": 2.612668514251709 + }, + { + "auxiliary_loss_clip": 0.01129922, + "auxiliary_loss_mlp": 0.01026217, + "balance_loss_clip": 1.04634726, + "balance_loss_mlp": 1.01890326, + "epoch": 0.8045451812661576, + "flos": 20631434860800.0, + "grad_norm": 2.4727561753325453, + "language_loss": 0.84178805, + "learning_rate": 3.8738592368573464e-07, + "loss": 0.86334938, + "num_input_tokens_seen": 144355305, + "step": 6691, + "time_per_iteration": 2.504849672317505 + }, + { + "auxiliary_loss_clip": 0.01114367, + "auxiliary_loss_mlp": 0.01023368, + "balance_loss_clip": 1.04309714, + "balance_loss_mlp": 1.01620388, + "epoch": 0.8046654241567968, + "flos": 29711806254720.0, + "grad_norm": 2.2698168009613346, + "language_loss": 0.87997055, + "learning_rate": 3.8692528645037137e-07, + "loss": 0.90134788, + "num_input_tokens_seen": 144374485, + "step": 6692, + "time_per_iteration": 2.547819137573242 + }, + { + "auxiliary_loss_clip": 0.01165869, + "auxiliary_loss_mlp": 0.01024266, + "balance_loss_clip": 1.04839516, + "balance_loss_mlp": 1.01750088, + "epoch": 0.8047856670474358, + "flos": 17671389851520.0, + "grad_norm": 2.1837270677836833, + "language_loss": 0.77791119, + "learning_rate": 3.8646489391715907e-07, + "loss": 0.79981256, + "num_input_tokens_seen": 144388780, + "step": 6693, + "time_per_iteration": 2.38718581199646 + }, + { + "auxiliary_loss_clip": 0.01136008, + "auxiliary_loss_mlp": 0.0102805, + "balance_loss_clip": 1.04445839, + "balance_loss_mlp": 1.02090669, + "epoch": 0.8049059099380749, + "flos": 17120699464320.0, + "grad_norm": 2.3759723085219804, + "language_loss": 0.8833952, + "learning_rate": 3.8600474615593903e-07, + "loss": 0.90503579, + "num_input_tokens_seen": 144403395, + "step": 6694, + "time_per_iteration": 2.420403242111206 + }, + { + "auxiliary_loss_clip": 0.01034693, + "auxiliary_loss_mlp": 0.01001139, + "balance_loss_clip": 1.01007438, + "balance_loss_mlp": 1.00019681, + "epoch": 0.805026152828714, + "flos": 62212903240320.0, + "grad_norm": 0.7862004234973567, + "language_loss": 0.59673691, + "learning_rate": 3.8554484323651605e-07, + "loss": 0.61709523, + "num_input_tokens_seen": 144465265, + "step": 6695, + "time_per_iteration": 3.1427159309387207 + }, + { + "auxiliary_loss_clip": 0.01151022, + "auxiliary_loss_mlp": 0.00761606, + "balance_loss_clip": 1.04756498, + "balance_loss_mlp": 1.00050426, + "epoch": 0.8051463957193531, + "flos": 21688608971520.0, + "grad_norm": 1.6207591652070437, + "language_loss": 0.79389042, + "learning_rate": 3.85085185228657e-07, + "loss": 0.81301665, + "num_input_tokens_seen": 144484235, + "step": 6696, + "time_per_iteration": 2.4662370681762695 + }, + { + "auxiliary_loss_clip": 0.01133228, + "auxiliary_loss_mlp": 0.01026799, + "balance_loss_clip": 1.04390085, + "balance_loss_mlp": 1.0196048, + "epoch": 0.8052666386099921, + "flos": 32051458535040.0, + "grad_norm": 2.027045296516793, + "language_loss": 0.73432875, + "learning_rate": 3.8462577220209114e-07, + "loss": 0.75592899, + "num_input_tokens_seen": 144504610, + "step": 6697, + "time_per_iteration": 2.560473918914795 + }, + { + "auxiliary_loss_clip": 0.01064645, + "auxiliary_loss_mlp": 0.01001232, + "balance_loss_clip": 1.01025391, + "balance_loss_mlp": 1.00022495, + "epoch": 0.8053868815006313, + "flos": 67157875768320.0, + "grad_norm": 0.7226530099967193, + "language_loss": 0.59050536, + "learning_rate": 3.8416660422651127e-07, + "loss": 0.61116409, + "num_input_tokens_seen": 144574260, + "step": 6698, + "time_per_iteration": 3.09350848197937 + }, + { + "auxiliary_loss_clip": 0.01126881, + "auxiliary_loss_mlp": 0.01027739, + "balance_loss_clip": 1.04254925, + "balance_loss_mlp": 1.02010632, + "epoch": 0.8055071243912704, + "flos": 23837000307840.0, + "grad_norm": 2.1423675499563575, + "language_loss": 0.67922997, + "learning_rate": 3.837076813715723e-07, + "loss": 0.70077622, + "num_input_tokens_seen": 144594145, + "step": 6699, + "time_per_iteration": 2.529806613922119 + }, + { + "auxiliary_loss_clip": 0.01121408, + "auxiliary_loss_mlp": 0.01022613, + "balance_loss_clip": 1.04094076, + "balance_loss_mlp": 1.01503682, + "epoch": 0.8056273672819094, + "flos": 21324510760320.0, + "grad_norm": 1.6593974194596193, + "language_loss": 0.75310874, + "learning_rate": 3.832490037068941e-07, + "loss": 0.77454895, + "num_input_tokens_seen": 144612935, + "step": 6700, + "time_per_iteration": 2.5115110874176025 + }, + { + "auxiliary_loss_clip": 0.01094882, + "auxiliary_loss_mlp": 0.01020787, + "balance_loss_clip": 1.0410533, + "balance_loss_mlp": 1.0135628, + "epoch": 0.8057476101725486, + "flos": 25768383626880.0, + "grad_norm": 1.8502713842368181, + "language_loss": 0.75766021, + "learning_rate": 3.827905713020554e-07, + "loss": 0.77881694, + "num_input_tokens_seen": 144630580, + "step": 6701, + "time_per_iteration": 2.5936341285705566 + }, + { + "auxiliary_loss_clip": 0.01126223, + "auxiliary_loss_mlp": 0.01027587, + "balance_loss_clip": 1.04121804, + "balance_loss_mlp": 1.01959968, + "epoch": 0.8058678530631876, + "flos": 24535283679360.0, + "grad_norm": 2.9949510074060215, + "language_loss": 0.68960822, + "learning_rate": 3.823323842266017e-07, + "loss": 0.71114624, + "num_input_tokens_seen": 144649975, + "step": 6702, + "time_per_iteration": 2.5584065914154053 + }, + { + "auxiliary_loss_clip": 0.01151998, + "auxiliary_loss_mlp": 0.01026027, + "balance_loss_clip": 1.04330182, + "balance_loss_mlp": 1.01885414, + "epoch": 0.8059880959538267, + "flos": 24753728240640.0, + "grad_norm": 2.5664519967402004, + "language_loss": 0.7320441, + "learning_rate": 3.818744425500393e-07, + "loss": 0.75382435, + "num_input_tokens_seen": 144667990, + "step": 6703, + "time_per_iteration": 2.468947172164917 + }, + { + "auxiliary_loss_clip": 0.01117729, + "auxiliary_loss_mlp": 0.0102817, + "balance_loss_clip": 1.04089403, + "balance_loss_mlp": 1.02005482, + "epoch": 0.8061083388444659, + "flos": 22196349671040.0, + "grad_norm": 1.8267384998821306, + "language_loss": 0.80989528, + "learning_rate": 3.8141674634183675e-07, + "loss": 0.83135426, + "num_input_tokens_seen": 144687020, + "step": 6704, + "time_per_iteration": 2.511730670928955 + }, + { + "auxiliary_loss_clip": 0.01107883, + "auxiliary_loss_mlp": 0.01024559, + "balance_loss_clip": 1.04325318, + "balance_loss_mlp": 1.01813102, + "epoch": 0.8062285817351049, + "flos": 30044195735040.0, + "grad_norm": 1.7698414397158995, + "language_loss": 0.66323364, + "learning_rate": 3.809592956714278e-07, + "loss": 0.68455803, + "num_input_tokens_seen": 144710255, + "step": 6705, + "time_per_iteration": 2.619598627090454 + }, + { + "auxiliary_loss_clip": 0.01156786, + "auxiliary_loss_mlp": 0.01027432, + "balance_loss_clip": 1.04809058, + "balance_loss_mlp": 1.02072668, + "epoch": 0.806348824625744, + "flos": 22782591544320.0, + "grad_norm": 1.907890647993773, + "language_loss": 0.74567449, + "learning_rate": 3.805020906082057e-07, + "loss": 0.76751667, + "num_input_tokens_seen": 144728830, + "step": 6706, + "time_per_iteration": 2.48353910446167 + }, + { + "auxiliary_loss_clip": 0.01141351, + "auxiliary_loss_mlp": 0.01023647, + "balance_loss_clip": 1.04517627, + "balance_loss_mlp": 1.01607728, + "epoch": 0.8064690675163831, + "flos": 23404600385280.0, + "grad_norm": 2.3937022476838288, + "language_loss": 0.80873859, + "learning_rate": 3.8004513122152917e-07, + "loss": 0.83038855, + "num_input_tokens_seen": 144747140, + "step": 6707, + "time_per_iteration": 2.474058151245117 + }, + { + "auxiliary_loss_clip": 0.0113162, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.04780769, + "balance_loss_mlp": 1.02523577, + "epoch": 0.8065893104070222, + "flos": 24060903736320.0, + "grad_norm": 1.685016397270131, + "language_loss": 0.66997236, + "learning_rate": 3.79588417580718e-07, + "loss": 0.69160903, + "num_input_tokens_seen": 144765250, + "step": 6708, + "time_per_iteration": 3.3262388706207275 + }, + { + "auxiliary_loss_clip": 0.01153963, + "auxiliary_loss_mlp": 0.01023628, + "balance_loss_clip": 1.04774809, + "balance_loss_mlp": 1.01699686, + "epoch": 0.8067095532976613, + "flos": 22305410340480.0, + "grad_norm": 2.1075360374718954, + "language_loss": 0.76660264, + "learning_rate": 3.791319497550558e-07, + "loss": 0.78837848, + "num_input_tokens_seen": 144783080, + "step": 6709, + "time_per_iteration": 2.4473893642425537 + }, + { + "auxiliary_loss_clip": 0.01130252, + "auxiliary_loss_mlp": 0.00761406, + "balance_loss_clip": 1.04489422, + "balance_loss_mlp": 1.00036776, + "epoch": 0.8068297961883004, + "flos": 17129498296320.0, + "grad_norm": 1.9592145636576255, + "language_loss": 0.70970368, + "learning_rate": 3.78675727813788e-07, + "loss": 0.72862029, + "num_input_tokens_seen": 144800645, + "step": 6710, + "time_per_iteration": 3.3032350540161133 + }, + { + "auxiliary_loss_clip": 0.0113773, + "auxiliary_loss_mlp": 0.01020867, + "balance_loss_clip": 1.04538417, + "balance_loss_mlp": 1.01394057, + "epoch": 0.8069500390789395, + "flos": 22018843635840.0, + "grad_norm": 1.8471727289918745, + "language_loss": 0.73801577, + "learning_rate": 3.782197518261225e-07, + "loss": 0.75960177, + "num_input_tokens_seen": 144820085, + "step": 6711, + "time_per_iteration": 3.321425199508667 + }, + { + "auxiliary_loss_clip": 0.01144001, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.04665875, + "balance_loss_mlp": 1.0232482, + "epoch": 0.8070702819695785, + "flos": 19244241567360.0, + "grad_norm": 1.9601434379668574, + "language_loss": 0.95249069, + "learning_rate": 3.777640218612319e-07, + "loss": 0.97423285, + "num_input_tokens_seen": 144838070, + "step": 6712, + "time_per_iteration": 2.44708251953125 + }, + { + "auxiliary_loss_clip": 0.01144317, + "auxiliary_loss_mlp": 0.01025124, + "balance_loss_clip": 1.0443778, + "balance_loss_mlp": 1.01818633, + "epoch": 0.8071905248602176, + "flos": 21544320038400.0, + "grad_norm": 2.1807629174518945, + "language_loss": 0.71770018, + "learning_rate": 3.773085379882488e-07, + "loss": 0.73939461, + "num_input_tokens_seen": 144857125, + "step": 6713, + "time_per_iteration": 2.441237211227417 + }, + { + "auxiliary_loss_clip": 0.01150555, + "auxiliary_loss_mlp": 0.00761898, + "balance_loss_clip": 1.04338217, + "balance_loss_mlp": 1.00043595, + "epoch": 0.8073107677508568, + "flos": 37268309105280.0, + "grad_norm": 5.107299336549649, + "language_loss": 0.76135564, + "learning_rate": 3.768533002762715e-07, + "loss": 0.78048015, + "num_input_tokens_seen": 144880660, + "step": 6714, + "time_per_iteration": 2.5883443355560303 + }, + { + "auxiliary_loss_clip": 0.01136512, + "auxiliary_loss_mlp": 0.01020915, + "balance_loss_clip": 1.04164505, + "balance_loss_mlp": 1.01426852, + "epoch": 0.8074310106414958, + "flos": 28366269759360.0, + "grad_norm": 1.8073421214175733, + "language_loss": 0.76833713, + "learning_rate": 3.763983087943572e-07, + "loss": 0.78991139, + "num_input_tokens_seen": 144900050, + "step": 6715, + "time_per_iteration": 3.2253429889678955 + }, + { + "auxiliary_loss_clip": 0.01142234, + "auxiliary_loss_mlp": 0.00761875, + "balance_loss_clip": 1.04312479, + "balance_loss_mlp": 1.00046432, + "epoch": 0.8075512535321349, + "flos": 24281646768000.0, + "grad_norm": 5.668014207877506, + "language_loss": 0.80547422, + "learning_rate": 3.759435636115282e-07, + "loss": 0.82451534, + "num_input_tokens_seen": 144920835, + "step": 6716, + "time_per_iteration": 2.5130603313446045 + }, + { + "auxiliary_loss_clip": 0.01095494, + "auxiliary_loss_mlp": 0.00761484, + "balance_loss_clip": 1.04471576, + "balance_loss_mlp": 1.0004586, + "epoch": 0.807671496422774, + "flos": 26030855283840.0, + "grad_norm": 1.690034293635486, + "language_loss": 0.72882104, + "learning_rate": 3.7548906479676967e-07, + "loss": 0.74739081, + "num_input_tokens_seen": 144940430, + "step": 6717, + "time_per_iteration": 2.62199068069458 + }, + { + "auxiliary_loss_clip": 0.01155652, + "auxiliary_loss_mlp": 0.01022447, + "balance_loss_clip": 1.04553843, + "balance_loss_mlp": 1.01581025, + "epoch": 0.8077917393134131, + "flos": 23730740899200.0, + "grad_norm": 1.7878890026842318, + "language_loss": 0.71957392, + "learning_rate": 3.7503481241902855e-07, + "loss": 0.74135494, + "num_input_tokens_seen": 144960405, + "step": 6718, + "time_per_iteration": 2.470048189163208 + }, + { + "auxiliary_loss_clip": 0.01139283, + "auxiliary_loss_mlp": 0.00761381, + "balance_loss_clip": 1.04501224, + "balance_loss_mlp": 1.0004518, + "epoch": 0.8079119822040521, + "flos": 18402028398720.0, + "grad_norm": 1.7789515554819748, + "language_loss": 0.80462229, + "learning_rate": 3.745808065472145e-07, + "loss": 0.82362896, + "num_input_tokens_seen": 144977700, + "step": 6719, + "time_per_iteration": 2.4562885761260986 + }, + { + "auxiliary_loss_clip": 0.01151322, + "auxiliary_loss_mlp": 0.01028168, + "balance_loss_clip": 1.05156875, + "balance_loss_mlp": 1.02140331, + "epoch": 0.8080322250946913, + "flos": 23621787970560.0, + "grad_norm": 1.5855762524968087, + "language_loss": 0.76182711, + "learning_rate": 3.741270472501994e-07, + "loss": 0.78362209, + "num_input_tokens_seen": 144998340, + "step": 6720, + "time_per_iteration": 2.4687557220458984 + }, + { + "auxiliary_loss_clip": 0.01135863, + "auxiliary_loss_mlp": 0.01021836, + "balance_loss_clip": 1.04576635, + "balance_loss_mlp": 1.01557755, + "epoch": 0.8081524679853304, + "flos": 22820692896000.0, + "grad_norm": 1.6951240409153494, + "language_loss": 0.72786272, + "learning_rate": 3.736735345968183e-07, + "loss": 0.74943966, + "num_input_tokens_seen": 145017950, + "step": 6721, + "time_per_iteration": 2.491544723510742 + }, + { + "auxiliary_loss_clip": 0.0115311, + "auxiliary_loss_mlp": 0.0102307, + "balance_loss_clip": 1.04740298, + "balance_loss_mlp": 1.01637042, + "epoch": 0.8082727108759694, + "flos": 17640004343040.0, + "grad_norm": 2.2561155494325416, + "language_loss": 0.78834271, + "learning_rate": 3.7322026865586986e-07, + "loss": 0.81010449, + "num_input_tokens_seen": 145036985, + "step": 6722, + "time_per_iteration": 2.431783437728882 + }, + { + "auxiliary_loss_clip": 0.01158348, + "auxiliary_loss_mlp": 0.01023599, + "balance_loss_clip": 1.0487026, + "balance_loss_mlp": 1.01633954, + "epoch": 0.8083929537666086, + "flos": 25958172113280.0, + "grad_norm": 2.373226396603151, + "language_loss": 0.73515564, + "learning_rate": 3.7276724949611206e-07, + "loss": 0.75697517, + "num_input_tokens_seen": 145057095, + "step": 6723, + "time_per_iteration": 2.480518102645874 + }, + { + "auxiliary_loss_clip": 0.01141912, + "auxiliary_loss_mlp": 0.0102297, + "balance_loss_clip": 1.04560447, + "balance_loss_mlp": 1.0155189, + "epoch": 0.8085131966572476, + "flos": 27089178629760.0, + "grad_norm": 1.8693078251011823, + "language_loss": 0.75131035, + "learning_rate": 3.723144771862694e-07, + "loss": 0.77295911, + "num_input_tokens_seen": 145077735, + "step": 6724, + "time_per_iteration": 2.5143120288848877 + }, + { + "auxiliary_loss_clip": 0.01126972, + "auxiliary_loss_mlp": 0.0102167, + "balance_loss_clip": 1.04233742, + "balance_loss_mlp": 1.01441061, + "epoch": 0.8086334395478867, + "flos": 23988543788160.0, + "grad_norm": 1.5281396431524967, + "language_loss": 0.76768053, + "learning_rate": 3.718619517950263e-07, + "loss": 0.78916693, + "num_input_tokens_seen": 145098330, + "step": 6725, + "time_per_iteration": 2.533939838409424 + }, + { + "auxiliary_loss_clip": 0.01166834, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.04898512, + "balance_loss_mlp": 1.02228868, + "epoch": 0.8087536824385259, + "flos": 20405879406720.0, + "grad_norm": 1.9280259871147754, + "language_loss": 0.76751697, + "learning_rate": 3.714096733910301e-07, + "loss": 0.78947645, + "num_input_tokens_seen": 145115855, + "step": 6726, + "time_per_iteration": 2.4907848834991455 + }, + { + "auxiliary_loss_clip": 0.01160154, + "auxiliary_loss_mlp": 0.01024471, + "balance_loss_clip": 1.047575, + "balance_loss_mlp": 1.01678181, + "epoch": 0.8088739253291649, + "flos": 25919639798400.0, + "grad_norm": 1.8955932804268432, + "language_loss": 0.705755, + "learning_rate": 3.709576420428926e-07, + "loss": 0.72760129, + "num_input_tokens_seen": 145136655, + "step": 6727, + "time_per_iteration": 2.5362765789031982 + }, + { + "auxiliary_loss_clip": 0.01137981, + "auxiliary_loss_mlp": 0.01022948, + "balance_loss_clip": 1.04221606, + "balance_loss_mlp": 1.01615, + "epoch": 0.808994168219804, + "flos": 28402072640640.0, + "grad_norm": 2.3979954908928223, + "language_loss": 0.73727179, + "learning_rate": 3.7050585781918463e-07, + "loss": 0.75888109, + "num_input_tokens_seen": 145156955, + "step": 6728, + "time_per_iteration": 2.522301435470581 + }, + { + "auxiliary_loss_clip": 0.01155645, + "auxiliary_loss_mlp": 0.01025312, + "balance_loss_clip": 1.04588485, + "balance_loss_mlp": 1.01780176, + "epoch": 0.8091144111104431, + "flos": 17421056991360.0, + "grad_norm": 1.959254908405037, + "language_loss": 0.68922156, + "learning_rate": 3.700543207884428e-07, + "loss": 0.71103108, + "num_input_tokens_seen": 145173865, + "step": 6729, + "time_per_iteration": 2.4134814739227295 + }, + { + "auxiliary_loss_clip": 0.0115155, + "auxiliary_loss_mlp": 0.01023111, + "balance_loss_clip": 1.04756951, + "balance_loss_mlp": 1.01617646, + "epoch": 0.8092346540010822, + "flos": 32153803361280.0, + "grad_norm": 2.802261506359597, + "language_loss": 0.71408498, + "learning_rate": 3.6960303101916466e-07, + "loss": 0.73583168, + "num_input_tokens_seen": 145193780, + "step": 6730, + "time_per_iteration": 2.5015552043914795 + }, + { + "auxiliary_loss_clip": 0.01064216, + "auxiliary_loss_mlp": 0.00753147, + "balance_loss_clip": 1.00998366, + "balance_loss_mlp": 1.00010478, + "epoch": 0.8093548968917212, + "flos": 58035093390720.0, + "grad_norm": 0.7580382329229355, + "language_loss": 0.5553233, + "learning_rate": 3.6915198857981047e-07, + "loss": 0.57349694, + "num_input_tokens_seen": 145258980, + "step": 6731, + "time_per_iteration": 3.040640115737915 + }, + { + "auxiliary_loss_clip": 0.01123919, + "auxiliary_loss_mlp": 0.01024858, + "balance_loss_clip": 1.04523206, + "balance_loss_mlp": 1.0169307, + "epoch": 0.8094751397823604, + "flos": 27381599251200.0, + "grad_norm": 1.7873658474159342, + "language_loss": 0.68177879, + "learning_rate": 3.687011935388027e-07, + "loss": 0.70326662, + "num_input_tokens_seen": 145281875, + "step": 6732, + "time_per_iteration": 2.5652506351470947 + }, + { + "auxiliary_loss_clip": 0.01151315, + "auxiliary_loss_mlp": 0.01019669, + "balance_loss_clip": 1.0462985, + "balance_loss_mlp": 1.01307106, + "epoch": 0.8095953826729995, + "flos": 24061083304320.0, + "grad_norm": 1.9111601097421276, + "language_loss": 0.72748965, + "learning_rate": 3.6825064596452646e-07, + "loss": 0.74919951, + "num_input_tokens_seen": 145302220, + "step": 6733, + "time_per_iteration": 2.4660708904266357 + }, + { + "auxiliary_loss_clip": 0.01151579, + "auxiliary_loss_mlp": 0.01022666, + "balance_loss_clip": 1.04503703, + "balance_loss_mlp": 1.0158236, + "epoch": 0.8097156255636385, + "flos": 23951412103680.0, + "grad_norm": 1.6899498514628872, + "language_loss": 0.7094565, + "learning_rate": 3.678003459253305e-07, + "loss": 0.73119903, + "num_input_tokens_seen": 145323070, + "step": 6734, + "time_per_iteration": 3.3349273204803467 + }, + { + "auxiliary_loss_clip": 0.01124053, + "auxiliary_loss_mlp": 0.01021403, + "balance_loss_clip": 1.04286206, + "balance_loss_mlp": 1.01383352, + "epoch": 0.8098358684542777, + "flos": 21799142098560.0, + "grad_norm": 2.3888308610252293, + "language_loss": 0.7422775, + "learning_rate": 3.673502934895236e-07, + "loss": 0.76373208, + "num_input_tokens_seen": 145342575, + "step": 6735, + "time_per_iteration": 2.5073490142822266 + }, + { + "auxiliary_loss_clip": 0.01063695, + "auxiliary_loss_mlp": 0.0100167, + "balance_loss_clip": 1.00955558, + "balance_loss_mlp": 1.00069845, + "epoch": 0.8099561113449167, + "flos": 68809515966720.0, + "grad_norm": 0.6919123196324478, + "language_loss": 0.57949078, + "learning_rate": 3.669004887253802e-07, + "loss": 0.60014439, + "num_input_tokens_seen": 145408865, + "step": 6736, + "time_per_iteration": 3.1324520111083984 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.0102314, + "balance_loss_clip": 1.04639959, + "balance_loss_mlp": 1.01644981, + "epoch": 0.8100763542355558, + "flos": 23586056916480.0, + "grad_norm": 1.5942867345826883, + "language_loss": 0.78958714, + "learning_rate": 3.664509317011335e-07, + "loss": 0.8112278, + "num_input_tokens_seen": 145429200, + "step": 6737, + "time_per_iteration": 3.324171781539917 + }, + { + "auxiliary_loss_clip": 0.01153624, + "auxiliary_loss_mlp": 0.01026404, + "balance_loss_clip": 1.04933083, + "balance_loss_mlp": 1.01877201, + "epoch": 0.810196597126195, + "flos": 31650408207360.0, + "grad_norm": 2.221953221788739, + "language_loss": 0.73493624, + "learning_rate": 3.6600162248498134e-07, + "loss": 0.75673652, + "num_input_tokens_seen": 145452830, + "step": 6738, + "time_per_iteration": 3.3799679279327393 + }, + { + "auxiliary_loss_clip": 0.01081264, + "auxiliary_loss_mlp": 0.01022846, + "balance_loss_clip": 1.03793836, + "balance_loss_mlp": 1.01630449, + "epoch": 0.810316840016834, + "flos": 24900459298560.0, + "grad_norm": 1.7505623178314402, + "language_loss": 0.75748158, + "learning_rate": 3.6555256114508426e-07, + "loss": 0.77852267, + "num_input_tokens_seen": 145472625, + "step": 6739, + "time_per_iteration": 2.589928388595581 + }, + { + "auxiliary_loss_clip": 0.01136107, + "auxiliary_loss_mlp": 0.01024665, + "balance_loss_clip": 1.04095316, + "balance_loss_mlp": 1.0174644, + "epoch": 0.8104370829074731, + "flos": 27965003950080.0, + "grad_norm": 2.152393013104217, + "language_loss": 0.73196304, + "learning_rate": 3.651037477495642e-07, + "loss": 0.75357068, + "num_input_tokens_seen": 145494075, + "step": 6740, + "time_per_iteration": 2.5469439029693604 + }, + { + "auxiliary_loss_clip": 0.01165486, + "auxiliary_loss_mlp": 0.01023075, + "balance_loss_clip": 1.04593062, + "balance_loss_mlp": 1.01602674, + "epoch": 0.8105573257981122, + "flos": 24640752988800.0, + "grad_norm": 2.214866630627599, + "language_loss": 0.68145937, + "learning_rate": 3.6465518236650584e-07, + "loss": 0.703345, + "num_input_tokens_seen": 145514220, + "step": 6741, + "time_per_iteration": 3.183682441711426 + }, + { + "auxiliary_loss_clip": 0.01123966, + "auxiliary_loss_mlp": 0.01027906, + "balance_loss_clip": 1.04270828, + "balance_loss_mlp": 1.0214386, + "epoch": 0.8106775686887513, + "flos": 26358935132160.0, + "grad_norm": 1.6831597036051975, + "language_loss": 0.78348041, + "learning_rate": 3.642068650639558e-07, + "loss": 0.80499911, + "num_input_tokens_seen": 145533965, + "step": 6742, + "time_per_iteration": 2.5536162853240967 + }, + { + "auxiliary_loss_clip": 0.01129906, + "auxiliary_loss_mlp": 0.01027939, + "balance_loss_clip": 1.03913927, + "balance_loss_mlp": 1.02102757, + "epoch": 0.8107978115793903, + "flos": 27271892136960.0, + "grad_norm": 5.969952903569133, + "language_loss": 0.64504516, + "learning_rate": 3.6375879590992334e-07, + "loss": 0.66662353, + "num_input_tokens_seen": 145554310, + "step": 6743, + "time_per_iteration": 2.519423246383667 + }, + { + "auxiliary_loss_clip": 0.01133314, + "auxiliary_loss_mlp": 0.0102539, + "balance_loss_clip": 1.04300809, + "balance_loss_mlp": 1.01820803, + "epoch": 0.8109180544700295, + "flos": 24934322845440.0, + "grad_norm": 1.76173379754774, + "language_loss": 0.81191272, + "learning_rate": 3.6331097497238173e-07, + "loss": 0.83349979, + "num_input_tokens_seen": 145573755, + "step": 6744, + "time_per_iteration": 2.5046942234039307 + }, + { + "auxiliary_loss_clip": 0.0112124, + "auxiliary_loss_mlp": 0.01019148, + "balance_loss_clip": 1.04242122, + "balance_loss_mlp": 1.0123229, + "epoch": 0.8110382973606686, + "flos": 21105383840640.0, + "grad_norm": 2.112121223684056, + "language_loss": 0.8019529, + "learning_rate": 3.628634023192627e-07, + "loss": 0.82335675, + "num_input_tokens_seen": 145594000, + "step": 6745, + "time_per_iteration": 2.5282111167907715 + }, + { + "auxiliary_loss_clip": 0.011539, + "auxiliary_loss_mlp": 0.01026568, + "balance_loss_clip": 1.04620337, + "balance_loss_mlp": 1.01866996, + "epoch": 0.8111585402513076, + "flos": 15414081500160.0, + "grad_norm": 2.0551271466630947, + "language_loss": 0.75035942, + "learning_rate": 3.624160780184644e-07, + "loss": 0.77216411, + "num_input_tokens_seen": 145611215, + "step": 6746, + "time_per_iteration": 2.4097530841827393 + }, + { + "auxiliary_loss_clip": 0.0113142, + "auxiliary_loss_mlp": 0.01025055, + "balance_loss_clip": 1.04324627, + "balance_loss_mlp": 1.01795936, + "epoch": 0.8112787831419467, + "flos": 24095736950400.0, + "grad_norm": 2.179920322205396, + "language_loss": 0.74357545, + "learning_rate": 3.6196900213784496e-07, + "loss": 0.76514018, + "num_input_tokens_seen": 145630530, + "step": 6747, + "time_per_iteration": 2.516167640686035 + }, + { + "auxiliary_loss_clip": 0.01150903, + "auxiliary_loss_mlp": 0.01025265, + "balance_loss_clip": 1.04547477, + "balance_loss_mlp": 1.01844358, + "epoch": 0.8113990260325858, + "flos": 20483374999680.0, + "grad_norm": 1.7869277150268115, + "language_loss": 0.8668676, + "learning_rate": 3.6152217474522527e-07, + "loss": 0.88862932, + "num_input_tokens_seen": 145647345, + "step": 6748, + "time_per_iteration": 2.4633281230926514 + }, + { + "auxiliary_loss_clip": 0.01150552, + "auxiliary_loss_mlp": 0.01027627, + "balance_loss_clip": 1.04738283, + "balance_loss_mlp": 1.02104044, + "epoch": 0.8115192689232249, + "flos": 24901141656960.0, + "grad_norm": 1.782953547231123, + "language_loss": 0.72669113, + "learning_rate": 3.6107559590838975e-07, + "loss": 0.74847293, + "num_input_tokens_seen": 145666330, + "step": 6749, + "time_per_iteration": 2.5125668048858643 + }, + { + "auxiliary_loss_clip": 0.01091151, + "auxiliary_loss_mlp": 0.01024067, + "balance_loss_clip": 1.04053617, + "balance_loss_mlp": 1.01686072, + "epoch": 0.811639511813864, + "flos": 24057204635520.0, + "grad_norm": 2.306753225631151, + "language_loss": 0.66318452, + "learning_rate": 3.606292656950822e-07, + "loss": 0.68433666, + "num_input_tokens_seen": 145684740, + "step": 6750, + "time_per_iteration": 2.596280574798584 + }, + { + "auxiliary_loss_clip": 0.01133294, + "auxiliary_loss_mlp": 0.01021933, + "balance_loss_clip": 1.04214907, + "balance_loss_mlp": 1.01464307, + "epoch": 0.8117597547045031, + "flos": 23185150243200.0, + "grad_norm": 2.020430092515925, + "language_loss": 0.86557746, + "learning_rate": 3.601831841730121e-07, + "loss": 0.88712972, + "num_input_tokens_seen": 145702660, + "step": 6751, + "time_per_iteration": 2.458836317062378 + }, + { + "auxiliary_loss_clip": 0.01151578, + "auxiliary_loss_mlp": 0.01022982, + "balance_loss_clip": 1.04663992, + "balance_loss_mlp": 1.01578462, + "epoch": 0.8118799975951422, + "flos": 23040250778880.0, + "grad_norm": 1.9722918012160227, + "language_loss": 0.72822905, + "learning_rate": 3.5973735140984916e-07, + "loss": 0.74997461, + "num_input_tokens_seen": 145722830, + "step": 6752, + "time_per_iteration": 2.4769294261932373 + }, + { + "auxiliary_loss_clip": 0.01103634, + "auxiliary_loss_mlp": 0.00760804, + "balance_loss_clip": 1.03789997, + "balance_loss_mlp": 1.0003264, + "epoch": 0.8120002404857812, + "flos": 24639962889600.0, + "grad_norm": 1.9830176021582018, + "language_loss": 0.79507232, + "learning_rate": 3.5929176747322607e-07, + "loss": 0.81371665, + "num_input_tokens_seen": 145741935, + "step": 6753, + "time_per_iteration": 2.562243700027466 + }, + { + "auxiliary_loss_clip": 0.01046647, + "auxiliary_loss_mlp": 0.01000779, + "balance_loss_clip": 1.01001108, + "balance_loss_mlp": 0.9998495, + "epoch": 0.8121204833764204, + "flos": 57415742156160.0, + "grad_norm": 0.8115165272066597, + "language_loss": 0.56236714, + "learning_rate": 3.588464324307372e-07, + "loss": 0.5828414, + "num_input_tokens_seen": 145805560, + "step": 6754, + "time_per_iteration": 3.1068928241729736 + }, + { + "auxiliary_loss_clip": 0.01151819, + "auxiliary_loss_mlp": 0.01023164, + "balance_loss_clip": 1.04366028, + "balance_loss_mlp": 1.01618767, + "epoch": 0.8122407262670595, + "flos": 19464589549440.0, + "grad_norm": 2.0230850209949782, + "language_loss": 0.7553066, + "learning_rate": 3.584013463499391e-07, + "loss": 0.77705646, + "num_input_tokens_seen": 145824180, + "step": 6755, + "time_per_iteration": 2.4217872619628906 + }, + { + "auxiliary_loss_clip": 0.01045, + "auxiliary_loss_mlp": 0.01002071, + "balance_loss_clip": 1.01142633, + "balance_loss_mlp": 1.00114107, + "epoch": 0.8123609691576985, + "flos": 56425325472000.0, + "grad_norm": 0.7364622901970289, + "language_loss": 0.64498335, + "learning_rate": 3.579565092983521e-07, + "loss": 0.66545409, + "num_input_tokens_seen": 145885300, + "step": 6756, + "time_per_iteration": 2.9235947132110596 + }, + { + "auxiliary_loss_clip": 0.01167596, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.04898143, + "balance_loss_mlp": 1.02456522, + "epoch": 0.8124812120483377, + "flos": 20631973564800.0, + "grad_norm": 2.057129140048583, + "language_loss": 0.83786952, + "learning_rate": 3.575119213434565e-07, + "loss": 0.85986215, + "num_input_tokens_seen": 145903815, + "step": 6757, + "time_per_iteration": 2.4005305767059326 + }, + { + "auxiliary_loss_clip": 0.01148256, + "auxiliary_loss_mlp": 0.01020378, + "balance_loss_clip": 1.04555666, + "balance_loss_mlp": 1.01345491, + "epoch": 0.8126014549389767, + "flos": 22492397566080.0, + "grad_norm": 1.7362842786458825, + "language_loss": 0.81990516, + "learning_rate": 3.5706758255269765e-07, + "loss": 0.84159148, + "num_input_tokens_seen": 145922270, + "step": 6758, + "time_per_iteration": 2.451983690261841 + }, + { + "auxiliary_loss_clip": 0.01140172, + "auxiliary_loss_mlp": 0.01025422, + "balance_loss_clip": 1.04530013, + "balance_loss_mlp": 1.01828694, + "epoch": 0.8127216978296158, + "flos": 23287961946240.0, + "grad_norm": 1.484807185471433, + "language_loss": 0.69738537, + "learning_rate": 3.566234929934795e-07, + "loss": 0.71904129, + "num_input_tokens_seen": 145941470, + "step": 6759, + "time_per_iteration": 2.4744327068328857 + }, + { + "auxiliary_loss_clip": 0.01151942, + "auxiliary_loss_mlp": 0.01028017, + "balance_loss_clip": 1.04967868, + "balance_loss_mlp": 1.02092731, + "epoch": 0.812841940720255, + "flos": 25154994049920.0, + "grad_norm": 1.467336948036511, + "language_loss": 0.717875, + "learning_rate": 3.561796527331706e-07, + "loss": 0.73967457, + "num_input_tokens_seen": 145963145, + "step": 6760, + "time_per_iteration": 2.4990997314453125 + }, + { + "auxiliary_loss_clip": 0.0112695, + "auxiliary_loss_mlp": 0.01021798, + "balance_loss_clip": 1.04342651, + "balance_loss_mlp": 1.01450837, + "epoch": 0.812962183610894, + "flos": 26648446752000.0, + "grad_norm": 3.316365297541003, + "language_loss": 0.77720737, + "learning_rate": 3.5573606183910163e-07, + "loss": 0.79869485, + "num_input_tokens_seen": 145983150, + "step": 6761, + "time_per_iteration": 3.4013988971710205 + }, + { + "auxiliary_loss_clip": 0.01156931, + "auxiliary_loss_mlp": 0.01023663, + "balance_loss_clip": 1.04515123, + "balance_loss_mlp": 1.0164094, + "epoch": 0.8130824265015331, + "flos": 24966965329920.0, + "grad_norm": 3.1749222682202984, + "language_loss": 0.7878952, + "learning_rate": 3.5529272037856493e-07, + "loss": 0.80970114, + "num_input_tokens_seen": 146001365, + "step": 6762, + "time_per_iteration": 2.455958843231201 + }, + { + "auxiliary_loss_clip": 0.01019128, + "auxiliary_loss_mlp": 0.01001795, + "balance_loss_clip": 1.01098633, + "balance_loss_mlp": 1.00054884, + "epoch": 0.8132026693921722, + "flos": 67622918175360.0, + "grad_norm": 0.7079936112692861, + "language_loss": 0.5393635, + "learning_rate": 3.548496284188149e-07, + "loss": 0.5595727, + "num_input_tokens_seen": 146061570, + "step": 6763, + "time_per_iteration": 4.004713773727417 + }, + { + "auxiliary_loss_clip": 0.01106041, + "auxiliary_loss_mlp": 0.01022403, + "balance_loss_clip": 1.04592276, + "balance_loss_mlp": 1.01542282, + "epoch": 0.8133229122828113, + "flos": 19495149045120.0, + "grad_norm": 1.788848618826129, + "language_loss": 0.7925967, + "learning_rate": 3.544067860270681e-07, + "loss": 0.81388116, + "num_input_tokens_seen": 146079145, + "step": 6764, + "time_per_iteration": 2.506537437438965 + }, + { + "auxiliary_loss_clip": 0.011261, + "auxiliary_loss_mlp": 0.01022283, + "balance_loss_clip": 1.04371929, + "balance_loss_mlp": 1.01499963, + "epoch": 0.8134431551734503, + "flos": 20668135582080.0, + "grad_norm": 1.6238590636023333, + "language_loss": 0.71366942, + "learning_rate": 3.539641932705029e-07, + "loss": 0.7351532, + "num_input_tokens_seen": 146097625, + "step": 6765, + "time_per_iteration": 3.4392738342285156 + }, + { + "auxiliary_loss_clip": 0.01168683, + "auxiliary_loss_mlp": 0.01022638, + "balance_loss_clip": 1.0468924, + "balance_loss_mlp": 1.01481187, + "epoch": 0.8135633980640895, + "flos": 21507332008320.0, + "grad_norm": 2.0457383455938745, + "language_loss": 0.76971227, + "learning_rate": 3.53521850216262e-07, + "loss": 0.7916255, + "num_input_tokens_seen": 146117195, + "step": 6766, + "time_per_iteration": 2.4210317134857178 + }, + { + "auxiliary_loss_clip": 0.0116716, + "auxiliary_loss_mlp": 0.01025439, + "balance_loss_clip": 1.04795361, + "balance_loss_mlp": 1.01800609, + "epoch": 0.8136836409547286, + "flos": 20554442058240.0, + "grad_norm": 1.8248419244985346, + "language_loss": 0.76607561, + "learning_rate": 3.530797569314461e-07, + "loss": 0.78800166, + "num_input_tokens_seen": 146136220, + "step": 6767, + "time_per_iteration": 2.449932813644409 + }, + { + "auxiliary_loss_clip": 0.0116627, + "auxiliary_loss_mlp": 0.01021435, + "balance_loss_clip": 1.04790521, + "balance_loss_mlp": 1.01436305, + "epoch": 0.8138038838453676, + "flos": 20299045380480.0, + "grad_norm": 2.0417898437070305, + "language_loss": 0.77742028, + "learning_rate": 3.5263791348312235e-07, + "loss": 0.79929733, + "num_input_tokens_seen": 146155415, + "step": 6768, + "time_per_iteration": 3.1844124794006348 + }, + { + "auxiliary_loss_clip": 0.01135365, + "auxiliary_loss_mlp": 0.01020437, + "balance_loss_clip": 1.04314303, + "balance_loss_mlp": 1.01336169, + "epoch": 0.8139241267360068, + "flos": 29789840551680.0, + "grad_norm": 1.835123071558431, + "language_loss": 0.70644391, + "learning_rate": 3.521963199383171e-07, + "loss": 0.72800195, + "num_input_tokens_seen": 146178370, + "step": 6769, + "time_per_iteration": 2.521782398223877 + }, + { + "auxiliary_loss_clip": 0.01111653, + "auxiliary_loss_mlp": 0.01024076, + "balance_loss_clip": 1.04306459, + "balance_loss_mlp": 1.01651525, + "epoch": 0.8140443696266458, + "flos": 19713270384000.0, + "grad_norm": 1.8564443512224955, + "language_loss": 0.77004403, + "learning_rate": 3.517549763640197e-07, + "loss": 0.79140127, + "num_input_tokens_seen": 146196010, + "step": 6770, + "time_per_iteration": 2.511802911758423 + }, + { + "auxiliary_loss_clip": 0.01150091, + "auxiliary_loss_mlp": 0.00761422, + "balance_loss_clip": 1.04841042, + "balance_loss_mlp": 1.0003798, + "epoch": 0.8141646125172849, + "flos": 27160568910720.0, + "grad_norm": 1.8615189306943924, + "language_loss": 0.71163589, + "learning_rate": 3.513138828271829e-07, + "loss": 0.73075098, + "num_input_tokens_seen": 146215880, + "step": 6771, + "time_per_iteration": 2.478868246078491 + }, + { + "auxiliary_loss_clip": 0.01121097, + "auxiliary_loss_mlp": 0.0102805, + "balance_loss_clip": 1.04437292, + "balance_loss_mlp": 1.02095151, + "epoch": 0.8142848554079241, + "flos": 39673102700160.0, + "grad_norm": 2.1408818788341955, + "language_loss": 0.70136356, + "learning_rate": 3.508730393947179e-07, + "loss": 0.72285497, + "num_input_tokens_seen": 146239135, + "step": 6772, + "time_per_iteration": 2.6523797512054443 + }, + { + "auxiliary_loss_clip": 0.01124148, + "auxiliary_loss_mlp": 0.01025436, + "balance_loss_clip": 1.04441214, + "balance_loss_mlp": 1.01801252, + "epoch": 0.8144050982985631, + "flos": 22237288197120.0, + "grad_norm": 1.5977252217901663, + "language_loss": 0.72052252, + "learning_rate": 3.504324461335024e-07, + "loss": 0.74201846, + "num_input_tokens_seen": 146259245, + "step": 6773, + "time_per_iteration": 2.5373635292053223 + }, + { + "auxiliary_loss_clip": 0.01101859, + "auxiliary_loss_mlp": 0.01025827, + "balance_loss_clip": 1.04027641, + "balance_loss_mlp": 1.01764369, + "epoch": 0.8145253411892022, + "flos": 23038239617280.0, + "grad_norm": 1.9788502437136732, + "language_loss": 0.88268816, + "learning_rate": 3.499921031103732e-07, + "loss": 0.90396506, + "num_input_tokens_seen": 146280015, + "step": 6774, + "time_per_iteration": 2.567244052886963 + }, + { + "auxiliary_loss_clip": 0.01133096, + "auxiliary_loss_mlp": 0.01021849, + "balance_loss_clip": 1.04251802, + "balance_loss_mlp": 1.01452994, + "epoch": 0.8146455840798413, + "flos": 24827668387200.0, + "grad_norm": 1.7630653603384026, + "language_loss": 0.78407025, + "learning_rate": 3.4955201039212987e-07, + "loss": 0.80561972, + "num_input_tokens_seen": 146300935, + "step": 6775, + "time_per_iteration": 2.546619415283203 + }, + { + "auxiliary_loss_clip": 0.01158709, + "auxiliary_loss_mlp": 0.0102378, + "balance_loss_clip": 1.04821849, + "balance_loss_mlp": 1.01678216, + "epoch": 0.8147658269704804, + "flos": 19974520978560.0, + "grad_norm": 1.8649612895370975, + "language_loss": 0.65722191, + "learning_rate": 3.4911216804553465e-07, + "loss": 0.67904675, + "num_input_tokens_seen": 146319835, + "step": 6776, + "time_per_iteration": 2.4294660091400146 + }, + { + "auxiliary_loss_clip": 0.01137693, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_clip": 1.04436111, + "balance_loss_mlp": 1.02024341, + "epoch": 0.8148860698611194, + "flos": 21178031097600.0, + "grad_norm": 2.32663602890063, + "language_loss": 0.70550597, + "learning_rate": 3.4867257613731017e-07, + "loss": 0.72716498, + "num_input_tokens_seen": 146339030, + "step": 6777, + "time_per_iteration": 2.457915782928467 + }, + { + "auxiliary_loss_clip": 0.0113917, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.0445354, + "balance_loss_mlp": 1.02190268, + "epoch": 0.8150063127517585, + "flos": 19606903234560.0, + "grad_norm": 1.7156260156982601, + "language_loss": 0.85830152, + "learning_rate": 3.4823323473414343e-07, + "loss": 0.87997961, + "num_input_tokens_seen": 146358550, + "step": 6778, + "time_per_iteration": 2.468317985534668 + }, + { + "auxiliary_loss_clip": 0.01129731, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.04342556, + "balance_loss_mlp": 1.02039576, + "epoch": 0.8151265556423977, + "flos": 22638374438400.0, + "grad_norm": 1.983731193480427, + "language_loss": 0.76251066, + "learning_rate": 3.477941439026812e-07, + "loss": 0.7840898, + "num_input_tokens_seen": 146376770, + "step": 6779, + "time_per_iteration": 2.494454860687256 + }, + { + "auxiliary_loss_clip": 0.01138949, + "auxiliary_loss_mlp": 0.01020383, + "balance_loss_clip": 1.0453769, + "balance_loss_mlp": 1.01381469, + "epoch": 0.8152467985330367, + "flos": 17968048277760.0, + "grad_norm": 1.7503836485970063, + "language_loss": 0.7342881, + "learning_rate": 3.473553037095349e-07, + "loss": 0.75588149, + "num_input_tokens_seen": 146395795, + "step": 6780, + "time_per_iteration": 2.452880382537842 + }, + { + "auxiliary_loss_clip": 0.01131214, + "auxiliary_loss_mlp": 0.01023514, + "balance_loss_clip": 1.04388928, + "balance_loss_mlp": 1.01690102, + "epoch": 0.8153670414236758, + "flos": 24969012405120.0, + "grad_norm": 1.9062589312524958, + "language_loss": 0.83227015, + "learning_rate": 3.469167142212743e-07, + "loss": 0.85381746, + "num_input_tokens_seen": 146417640, + "step": 6781, + "time_per_iteration": 2.5235676765441895 + }, + { + "auxiliary_loss_clip": 0.01152717, + "auxiliary_loss_mlp": 0.01021006, + "balance_loss_clip": 1.04702604, + "balance_loss_mlp": 1.0134542, + "epoch": 0.8154872843143149, + "flos": 31066069754880.0, + "grad_norm": 2.3758032009090173, + "language_loss": 0.63515741, + "learning_rate": 3.4647837550443337e-07, + "loss": 0.65689468, + "num_input_tokens_seen": 146436205, + "step": 6782, + "time_per_iteration": 2.5068750381469727 + }, + { + "auxiliary_loss_clip": 0.01125705, + "auxiliary_loss_mlp": 0.01024466, + "balance_loss_clip": 1.04395533, + "balance_loss_mlp": 1.01763213, + "epoch": 0.815607527204954, + "flos": 19391654983680.0, + "grad_norm": 1.7211574302120567, + "language_loss": 0.74506408, + "learning_rate": 3.460402876255086e-07, + "loss": 0.7665658, + "num_input_tokens_seen": 146453595, + "step": 6783, + "time_per_iteration": 2.486147403717041 + }, + { + "auxiliary_loss_clip": 0.01153723, + "auxiliary_loss_mlp": 0.01021259, + "balance_loss_clip": 1.04567194, + "balance_loss_mlp": 1.01410079, + "epoch": 0.815727770095593, + "flos": 26140418743680.0, + "grad_norm": 2.3238753950209503, + "language_loss": 0.72175866, + "learning_rate": 3.456024506509574e-07, + "loss": 0.74350846, + "num_input_tokens_seen": 146474515, + "step": 6784, + "time_per_iteration": 2.4733428955078125 + }, + { + "auxiliary_loss_clip": 0.01152921, + "auxiliary_loss_mlp": 0.00762019, + "balance_loss_clip": 1.04905045, + "balance_loss_mlp": 1.00027728, + "epoch": 0.8158480129862322, + "flos": 25337527989120.0, + "grad_norm": 1.5051325928858246, + "language_loss": 0.74103093, + "learning_rate": 3.4516486464719873e-07, + "loss": 0.76018029, + "num_input_tokens_seen": 146493905, + "step": 6785, + "time_per_iteration": 2.4823248386383057 + }, + { + "auxiliary_loss_clip": 0.01106708, + "auxiliary_loss_mlp": 0.01025541, + "balance_loss_clip": 1.04218411, + "balance_loss_mlp": 1.01806688, + "epoch": 0.8159682558768713, + "flos": 34423645559040.0, + "grad_norm": 2.8210384125043633, + "language_loss": 0.61852992, + "learning_rate": 3.4472752968061445e-07, + "loss": 0.6398524, + "num_input_tokens_seen": 146518335, + "step": 6786, + "time_per_iteration": 2.6731882095336914 + }, + { + "auxiliary_loss_clip": 0.01150896, + "auxiliary_loss_mlp": 0.01022862, + "balance_loss_clip": 1.04534078, + "balance_loss_mlp": 1.01609671, + "epoch": 0.8160884987675103, + "flos": 18653223185280.0, + "grad_norm": 1.8253541527372343, + "language_loss": 0.73594785, + "learning_rate": 3.442904458175475e-07, + "loss": 0.75768542, + "num_input_tokens_seen": 146535655, + "step": 6787, + "time_per_iteration": 2.4440877437591553 + }, + { + "auxiliary_loss_clip": 0.01149909, + "auxiliary_loss_mlp": 0.01022029, + "balance_loss_clip": 1.04453826, + "balance_loss_mlp": 1.01494253, + "epoch": 0.8162087416581495, + "flos": 31430527102080.0, + "grad_norm": 1.5866742510091092, + "language_loss": 0.75886804, + "learning_rate": 3.438536131243044e-07, + "loss": 0.78058738, + "num_input_tokens_seen": 146556815, + "step": 6788, + "time_per_iteration": 3.3892970085144043 + }, + { + "auxiliary_loss_clip": 0.01141134, + "auxiliary_loss_mlp": 0.01021383, + "balance_loss_clip": 1.04529071, + "balance_loss_mlp": 1.01405191, + "epoch": 0.8163289845487885, + "flos": 37593910915200.0, + "grad_norm": 2.0582690814130373, + "language_loss": 0.62014854, + "learning_rate": 3.434170316671503e-07, + "loss": 0.6417737, + "num_input_tokens_seen": 146581845, + "step": 6789, + "time_per_iteration": 2.616269826889038 + }, + { + "auxiliary_loss_clip": 0.01119483, + "auxiliary_loss_mlp": 0.01021749, + "balance_loss_clip": 1.04565001, + "balance_loss_mlp": 1.01456952, + "epoch": 0.8164492274394276, + "flos": 13953989554560.0, + "grad_norm": 2.5681373642359984, + "language_loss": 0.89551032, + "learning_rate": 3.4298070151231583e-07, + "loss": 0.91692269, + "num_input_tokens_seen": 146597245, + "step": 6790, + "time_per_iteration": 4.1213014125823975 + }, + { + "auxiliary_loss_clip": 0.01141093, + "auxiliary_loss_mlp": 0.01023911, + "balance_loss_clip": 1.04423189, + "balance_loss_mlp": 1.01686323, + "epoch": 0.8165694703300668, + "flos": 28986554747520.0, + "grad_norm": 1.807408001445995, + "language_loss": 0.59984934, + "learning_rate": 3.425446227259916e-07, + "loss": 0.62149942, + "num_input_tokens_seen": 146618210, + "step": 6791, + "time_per_iteration": 2.5338826179504395 + }, + { + "auxiliary_loss_clip": 0.01138134, + "auxiliary_loss_mlp": 0.01022011, + "balance_loss_clip": 1.04407978, + "balance_loss_mlp": 1.0158453, + "epoch": 0.8166897132207058, + "flos": 25118365155840.0, + "grad_norm": 1.8671361287771309, + "language_loss": 0.82407534, + "learning_rate": 3.421087953743296e-07, + "loss": 0.84567678, + "num_input_tokens_seen": 146637975, + "step": 6792, + "time_per_iteration": 2.5040810108184814 + }, + { + "auxiliary_loss_clip": 0.01151004, + "auxiliary_loss_mlp": 0.0102189, + "balance_loss_clip": 1.04346251, + "balance_loss_mlp": 1.01463604, + "epoch": 0.8168099561113449, + "flos": 23148593176320.0, + "grad_norm": 2.553035375556881, + "language_loss": 0.80015707, + "learning_rate": 3.416732195234464e-07, + "loss": 0.82188606, + "num_input_tokens_seen": 146658030, + "step": 6793, + "time_per_iteration": 2.4484047889709473 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01019627, + "balance_loss_clip": 1.04622722, + "balance_loss_mlp": 1.01309741, + "epoch": 0.816930199001984, + "flos": 18407666833920.0, + "grad_norm": 1.5536434911464005, + "language_loss": 0.79532695, + "learning_rate": 3.4123789523941613e-07, + "loss": 0.81707126, + "num_input_tokens_seen": 146677855, + "step": 6794, + "time_per_iteration": 3.206885814666748 + }, + { + "auxiliary_loss_clip": 0.01146089, + "auxiliary_loss_mlp": 0.0102228, + "balance_loss_clip": 1.04367006, + "balance_loss_mlp": 1.01458549, + "epoch": 0.8170504418926231, + "flos": 21251324799360.0, + "grad_norm": 1.4107944683858575, + "language_loss": 0.63365006, + "learning_rate": 3.4080282258827884e-07, + "loss": 0.65533376, + "num_input_tokens_seen": 146696230, + "step": 6795, + "time_per_iteration": 2.4373180866241455 + }, + { + "auxiliary_loss_clip": 0.01152875, + "auxiliary_loss_mlp": 0.01024394, + "balance_loss_clip": 1.04489505, + "balance_loss_mlp": 1.01735151, + "epoch": 0.8171706847832622, + "flos": 19099234362240.0, + "grad_norm": 2.140463471777983, + "language_loss": 0.72583759, + "learning_rate": 3.403680016360342e-07, + "loss": 0.74761027, + "num_input_tokens_seen": 146714835, + "step": 6796, + "time_per_iteration": 2.417269229888916 + }, + { + "auxiliary_loss_clip": 0.011466, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.04709888, + "balance_loss_mlp": 1.02022433, + "epoch": 0.8172909276739013, + "flos": 21470128496640.0, + "grad_norm": 1.710010476710125, + "language_loss": 0.67913902, + "learning_rate": 3.3993343244864403e-07, + "loss": 0.70088887, + "num_input_tokens_seen": 146734425, + "step": 6797, + "time_per_iteration": 2.448167562484741 + }, + { + "auxiliary_loss_clip": 0.01150581, + "auxiliary_loss_mlp": 0.01023064, + "balance_loss_clip": 1.04670596, + "balance_loss_mlp": 1.01618886, + "epoch": 0.8174111705645404, + "flos": 27599792417280.0, + "grad_norm": 1.5355567119877302, + "language_loss": 0.72985637, + "learning_rate": 3.394991150920323e-07, + "loss": 0.75159276, + "num_input_tokens_seen": 146757545, + "step": 6798, + "time_per_iteration": 2.4922478199005127 + }, + { + "auxiliary_loss_clip": 0.01112965, + "auxiliary_loss_mlp": 0.00762497, + "balance_loss_clip": 1.04283333, + "balance_loss_mlp": 1.00042295, + "epoch": 0.8175314134551794, + "flos": 14064594508800.0, + "grad_norm": 2.1401578866203996, + "language_loss": 0.74430454, + "learning_rate": 3.3906504963208396e-07, + "loss": 0.76305914, + "num_input_tokens_seen": 146774240, + "step": 6799, + "time_per_iteration": 2.5008223056793213 + }, + { + "auxiliary_loss_clip": 0.01105796, + "auxiliary_loss_mlp": 0.01020806, + "balance_loss_clip": 1.04337239, + "balance_loss_mlp": 1.01371288, + "epoch": 0.8176516563458186, + "flos": 22708076780160.0, + "grad_norm": 1.8264213608802478, + "language_loss": 0.66216028, + "learning_rate": 3.3863123613464774e-07, + "loss": 0.68342632, + "num_input_tokens_seen": 146793140, + "step": 6800, + "time_per_iteration": 2.5437674522399902 + }, + { + "auxiliary_loss_clip": 0.01137908, + "auxiliary_loss_mlp": 0.01024504, + "balance_loss_clip": 1.04049945, + "balance_loss_mlp": 1.0177331, + "epoch": 0.8177718992364577, + "flos": 21945406279680.0, + "grad_norm": 1.7232300717056792, + "language_loss": 0.75027788, + "learning_rate": 3.381976746655317e-07, + "loss": 0.77190197, + "num_input_tokens_seen": 146812895, + "step": 6801, + "time_per_iteration": 2.479572296142578 + }, + { + "auxiliary_loss_clip": 0.01102541, + "auxiliary_loss_mlp": 0.01024681, + "balance_loss_clip": 1.04279757, + "balance_loss_mlp": 1.01776099, + "epoch": 0.8178921421270967, + "flos": 22017443005440.0, + "grad_norm": 2.240674159579414, + "language_loss": 0.67516673, + "learning_rate": 3.3776436529050756e-07, + "loss": 0.69643903, + "num_input_tokens_seen": 146832445, + "step": 6802, + "time_per_iteration": 2.5210154056549072 + }, + { + "auxiliary_loss_clip": 0.01161293, + "auxiliary_loss_mlp": 0.01024136, + "balance_loss_clip": 1.04499745, + "balance_loss_mlp": 1.01685262, + "epoch": 0.8180123850177359, + "flos": 33183111496320.0, + "grad_norm": 1.5574127030815785, + "language_loss": 0.72516805, + "learning_rate": 3.373313080753073e-07, + "loss": 0.74702239, + "num_input_tokens_seen": 146856505, + "step": 6803, + "time_per_iteration": 2.507780075073242 + }, + { + "auxiliary_loss_clip": 0.01146004, + "auxiliary_loss_mlp": 0.01026121, + "balance_loss_clip": 1.0437026, + "balance_loss_mlp": 1.01911998, + "epoch": 0.8181326279083749, + "flos": 22091167670400.0, + "grad_norm": 1.5935360346107537, + "language_loss": 0.77332461, + "learning_rate": 3.3689850308562527e-07, + "loss": 0.79504585, + "num_input_tokens_seen": 146876950, + "step": 6804, + "time_per_iteration": 2.45294451713562 + }, + { + "auxiliary_loss_clip": 0.01102351, + "auxiliary_loss_mlp": 0.01025249, + "balance_loss_clip": 1.0438962, + "balance_loss_mlp": 1.01847172, + "epoch": 0.818252870799014, + "flos": 15705747936000.0, + "grad_norm": 1.988547393998068, + "language_loss": 0.7751469, + "learning_rate": 3.364659503871183e-07, + "loss": 0.79642284, + "num_input_tokens_seen": 146894885, + "step": 6805, + "time_per_iteration": 2.517223834991455 + }, + { + "auxiliary_loss_clip": 0.01121308, + "auxiliary_loss_mlp": 0.01022799, + "balance_loss_clip": 1.04175854, + "balance_loss_mlp": 1.01660275, + "epoch": 0.8183731136896532, + "flos": 18770687637120.0, + "grad_norm": 1.8962312875875167, + "language_loss": 0.83957648, + "learning_rate": 3.3603365004540417e-07, + "loss": 0.86101753, + "num_input_tokens_seen": 146913180, + "step": 6806, + "time_per_iteration": 2.4995391368865967 + }, + { + "auxiliary_loss_clip": 0.01164914, + "auxiliary_loss_mlp": 0.01025446, + "balance_loss_clip": 1.04882169, + "balance_loss_mlp": 1.01820111, + "epoch": 0.8184933565802922, + "flos": 26541792293760.0, + "grad_norm": 4.5038079164222955, + "language_loss": 0.77001929, + "learning_rate": 3.356016021260624e-07, + "loss": 0.79192287, + "num_input_tokens_seen": 146933510, + "step": 6807, + "time_per_iteration": 2.434100389480591 + }, + { + "auxiliary_loss_clip": 0.01152504, + "auxiliary_loss_mlp": 0.01023411, + "balance_loss_clip": 1.04692888, + "balance_loss_mlp": 1.01617527, + "epoch": 0.8186135994709313, + "flos": 17530117660800.0, + "grad_norm": 4.464870280662668, + "language_loss": 0.65455735, + "learning_rate": 3.35169806694634e-07, + "loss": 0.6763165, + "num_input_tokens_seen": 146951760, + "step": 6808, + "time_per_iteration": 2.418520212173462 + }, + { + "auxiliary_loss_clip": 0.01031117, + "auxiliary_loss_mlp": 0.01001583, + "balance_loss_clip": 1.01141548, + "balance_loss_mlp": 1.00076663, + "epoch": 0.8187338423615703, + "flos": 63480300675840.0, + "grad_norm": 0.7969511332793651, + "language_loss": 0.60679591, + "learning_rate": 3.3473826381662186e-07, + "loss": 0.627123, + "num_input_tokens_seen": 147022900, + "step": 6809, + "time_per_iteration": 3.213564872741699 + }, + { + "auxiliary_loss_clip": 0.0114486, + "auxiliary_loss_mlp": 0.01022537, + "balance_loss_clip": 1.04562509, + "balance_loss_mlp": 1.01572168, + "epoch": 0.8188540852522095, + "flos": 17529974006400.0, + "grad_norm": 1.808331669833302, + "language_loss": 0.8150667, + "learning_rate": 3.3430697355749216e-07, + "loss": 0.83674073, + "num_input_tokens_seen": 147040590, + "step": 6810, + "time_per_iteration": 2.410327434539795 + }, + { + "auxiliary_loss_clip": 0.01105508, + "auxiliary_loss_mlp": 0.01023655, + "balance_loss_clip": 1.04036248, + "balance_loss_mlp": 1.01603508, + "epoch": 0.8189743281428485, + "flos": 14392530702720.0, + "grad_norm": 1.8819289404063788, + "language_loss": 0.75460494, + "learning_rate": 3.3387593598266907e-07, + "loss": 0.77589655, + "num_input_tokens_seen": 147057200, + "step": 6811, + "time_per_iteration": 2.4785306453704834 + }, + { + "auxiliary_loss_clip": 0.01114251, + "auxiliary_loss_mlp": 0.01021904, + "balance_loss_clip": 1.04026806, + "balance_loss_mlp": 1.01505256, + "epoch": 0.8190945710334876, + "flos": 25080479285760.0, + "grad_norm": 2.59803691005206, + "language_loss": 0.78205597, + "learning_rate": 3.3344515115754225e-07, + "loss": 0.80341756, + "num_input_tokens_seen": 147076180, + "step": 6812, + "time_per_iteration": 2.528937816619873 + }, + { + "auxiliary_loss_clip": 0.01127628, + "auxiliary_loss_mlp": 0.0102061, + "balance_loss_clip": 1.04191816, + "balance_loss_mlp": 1.01353741, + "epoch": 0.8192148139241268, + "flos": 21507152440320.0, + "grad_norm": 3.966273827817656, + "language_loss": 0.80260217, + "learning_rate": 3.33014619147461e-07, + "loss": 0.82408452, + "num_input_tokens_seen": 147094205, + "step": 6813, + "time_per_iteration": 2.5006282329559326 + }, + { + "auxiliary_loss_clip": 0.01138862, + "auxiliary_loss_mlp": 0.01026467, + "balance_loss_clip": 1.04736912, + "balance_loss_mlp": 1.01959419, + "epoch": 0.8193350568147658, + "flos": 23952166289280.0, + "grad_norm": 2.9015100002381757, + "language_loss": 0.71450073, + "learning_rate": 3.325843400177362e-07, + "loss": 0.73615396, + "num_input_tokens_seen": 147115545, + "step": 6814, + "time_per_iteration": 3.3452374935150146 + }, + { + "auxiliary_loss_clip": 0.01155299, + "auxiliary_loss_mlp": 0.00761855, + "balance_loss_clip": 1.04669714, + "balance_loss_mlp": 1.00044894, + "epoch": 0.8194552997054049, + "flos": 20559469962240.0, + "grad_norm": 1.8868272918197915, + "language_loss": 0.73585165, + "learning_rate": 3.32154313833642e-07, + "loss": 0.75502312, + "num_input_tokens_seen": 147135700, + "step": 6815, + "time_per_iteration": 2.52557635307312 + }, + { + "auxiliary_loss_clip": 0.01167123, + "auxiliary_loss_mlp": 0.01025334, + "balance_loss_clip": 1.04735839, + "balance_loss_mlp": 1.01802087, + "epoch": 0.819575542596044, + "flos": 26031753123840.0, + "grad_norm": 2.701960319744234, + "language_loss": 0.59457517, + "learning_rate": 3.3172454066041164e-07, + "loss": 0.61649978, + "num_input_tokens_seen": 147155205, + "step": 6816, + "time_per_iteration": 3.199972629547119 + }, + { + "auxiliary_loss_clip": 0.01096637, + "auxiliary_loss_mlp": 0.00761462, + "balance_loss_clip": 1.04412985, + "balance_loss_mlp": 1.00040317, + "epoch": 0.8196957854866831, + "flos": 29096944220160.0, + "grad_norm": 1.8209534521393016, + "language_loss": 0.75782466, + "learning_rate": 3.3129502056324234e-07, + "loss": 0.77640569, + "num_input_tokens_seen": 147176570, + "step": 6817, + "time_per_iteration": 3.434685707092285 + }, + { + "auxiliary_loss_clip": 0.01005078, + "auxiliary_loss_mlp": 0.01001706, + "balance_loss_clip": 1.01192069, + "balance_loss_mlp": 1.00066268, + "epoch": 0.8198160283773221, + "flos": 69033631898880.0, + "grad_norm": 0.8014186045695255, + "language_loss": 0.59753567, + "learning_rate": 3.3086575360729165e-07, + "loss": 0.61760348, + "num_input_tokens_seen": 147234105, + "step": 6818, + "time_per_iteration": 3.1724495887756348 + }, + { + "auxiliary_loss_clip": 0.011364, + "auxiliary_loss_mlp": 0.01025633, + "balance_loss_clip": 1.04478908, + "balance_loss_mlp": 1.0182184, + "epoch": 0.8199362712679613, + "flos": 16618058496000.0, + "grad_norm": 1.6375991749784842, + "language_loss": 0.71092522, + "learning_rate": 3.3043673985767906e-07, + "loss": 0.73254561, + "num_input_tokens_seen": 147253170, + "step": 6819, + "time_per_iteration": 2.771613121032715 + }, + { + "auxiliary_loss_clip": 0.0111398, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.03930044, + "balance_loss_mlp": 1.01935029, + "epoch": 0.8200565141586004, + "flos": 21757664868480.0, + "grad_norm": 1.836627459465846, + "language_loss": 0.77541542, + "learning_rate": 3.3000797937948564e-07, + "loss": 0.79682302, + "num_input_tokens_seen": 147271465, + "step": 6820, + "time_per_iteration": 2.5191915035247803 + }, + { + "auxiliary_loss_clip": 0.01031906, + "auxiliary_loss_mlp": 0.01001433, + "balance_loss_clip": 1.00739598, + "balance_loss_mlp": 1.00044334, + "epoch": 0.8201767570492394, + "flos": 69807112392960.0, + "grad_norm": 0.9465417163068718, + "language_loss": 0.65043789, + "learning_rate": 3.295794722377534e-07, + "loss": 0.6707713, + "num_input_tokens_seen": 147335070, + "step": 6821, + "time_per_iteration": 4.202096939086914 + }, + { + "auxiliary_loss_clip": 0.01161817, + "auxiliary_loss_mlp": 0.01024813, + "balance_loss_clip": 1.0443958, + "balance_loss_mlp": 1.01835465, + "epoch": 0.8202969999398786, + "flos": 23111892455040.0, + "grad_norm": 2.0523362890499146, + "language_loss": 0.7999754, + "learning_rate": 3.291512184974876e-07, + "loss": 0.82184172, + "num_input_tokens_seen": 147355460, + "step": 6822, + "time_per_iteration": 2.4443702697753906 + }, + { + "auxiliary_loss_clip": 0.01134688, + "auxiliary_loss_mlp": 0.010225, + "balance_loss_clip": 1.04119492, + "balance_loss_mlp": 1.01518691, + "epoch": 0.8204172428305176, + "flos": 28220616109440.0, + "grad_norm": 1.6446217423910388, + "language_loss": 0.66581541, + "learning_rate": 3.2872321822365346e-07, + "loss": 0.68738729, + "num_input_tokens_seen": 147375675, + "step": 6823, + "time_per_iteration": 2.5599071979522705 + }, + { + "auxiliary_loss_clip": 0.01150777, + "auxiliary_loss_mlp": 0.01020447, + "balance_loss_clip": 1.04727721, + "balance_loss_mlp": 1.01334214, + "epoch": 0.8205374857211567, + "flos": 20887011106560.0, + "grad_norm": 1.8260951084729837, + "language_loss": 0.73330677, + "learning_rate": 3.282954714811783e-07, + "loss": 0.75501901, + "num_input_tokens_seen": 147394580, + "step": 6824, + "time_per_iteration": 2.4431612491607666 + }, + { + "auxiliary_loss_clip": 0.01124304, + "auxiliary_loss_mlp": 0.01023651, + "balance_loss_clip": 1.04019284, + "balance_loss_mlp": 1.01628447, + "epoch": 0.8206577286117959, + "flos": 13152140294400.0, + "grad_norm": 2.3441282185655514, + "language_loss": 0.70823205, + "learning_rate": 3.2786797833495093e-07, + "loss": 0.72971153, + "num_input_tokens_seen": 147409935, + "step": 6825, + "time_per_iteration": 2.463106155395508 + }, + { + "auxiliary_loss_clip": 0.01163014, + "auxiliary_loss_mlp": 0.01024655, + "balance_loss_clip": 1.04668975, + "balance_loss_mlp": 1.01823604, + "epoch": 0.8207779715024349, + "flos": 25265634917760.0, + "grad_norm": 1.7494974601576794, + "language_loss": 0.7275176, + "learning_rate": 3.274407388498213e-07, + "loss": 0.7493943, + "num_input_tokens_seen": 147428065, + "step": 6826, + "time_per_iteration": 2.440802812576294 + }, + { + "auxiliary_loss_clip": 0.01118471, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.04158759, + "balance_loss_mlp": 1.02046967, + "epoch": 0.820898214393074, + "flos": 19610243199360.0, + "grad_norm": 5.975114684799946, + "language_loss": 0.73896289, + "learning_rate": 3.270137530906021e-07, + "loss": 0.7604214, + "num_input_tokens_seen": 147447300, + "step": 6827, + "time_per_iteration": 2.5104928016662598 + }, + { + "auxiliary_loss_clip": 0.01102647, + "auxiliary_loss_mlp": 0.01023184, + "balance_loss_clip": 1.04478908, + "balance_loss_mlp": 1.01661253, + "epoch": 0.8210184572837131, + "flos": 15596615439360.0, + "grad_norm": 1.7635218594643283, + "language_loss": 0.83393168, + "learning_rate": 3.265870211220665e-07, + "loss": 0.85519004, + "num_input_tokens_seen": 147465135, + "step": 6828, + "time_per_iteration": 2.555386781692505 + }, + { + "auxiliary_loss_clip": 0.01121022, + "auxiliary_loss_mlp": 0.01028747, + "balance_loss_clip": 1.04428482, + "balance_loss_mlp": 1.02113509, + "epoch": 0.8211387001743522, + "flos": 20813932886400.0, + "grad_norm": 1.9315952694056193, + "language_loss": 0.81658369, + "learning_rate": 3.2616054300894934e-07, + "loss": 0.83808136, + "num_input_tokens_seen": 147484585, + "step": 6829, + "time_per_iteration": 2.6306464672088623 + }, + { + "auxiliary_loss_clip": 0.01128758, + "auxiliary_loss_mlp": 0.01021882, + "balance_loss_clip": 1.04499853, + "balance_loss_mlp": 1.01465201, + "epoch": 0.8212589430649913, + "flos": 27704579368320.0, + "grad_norm": 1.952161659050395, + "language_loss": 0.84430063, + "learning_rate": 3.2573431881594693e-07, + "loss": 0.865807, + "num_input_tokens_seen": 147504130, + "step": 6830, + "time_per_iteration": 2.5522513389587402 + }, + { + "auxiliary_loss_clip": 0.01094959, + "auxiliary_loss_mlp": 0.01022258, + "balance_loss_clip": 1.03914845, + "balance_loss_mlp": 1.01516461, + "epoch": 0.8213791859556304, + "flos": 22455625017600.0, + "grad_norm": 2.0103338968504167, + "language_loss": 0.65954649, + "learning_rate": 3.2530834860771663e-07, + "loss": 0.68071866, + "num_input_tokens_seen": 147523510, + "step": 6831, + "time_per_iteration": 2.6042492389678955 + }, + { + "auxiliary_loss_clip": 0.01151151, + "auxiliary_loss_mlp": 0.01023988, + "balance_loss_clip": 1.04412436, + "balance_loss_mlp": 1.01647198, + "epoch": 0.8214994288462695, + "flos": 16654471908480.0, + "grad_norm": 1.9991066861645834, + "language_loss": 0.74314129, + "learning_rate": 3.248826324488794e-07, + "loss": 0.7648927, + "num_input_tokens_seen": 147540805, + "step": 6832, + "time_per_iteration": 2.421898365020752 + }, + { + "auxiliary_loss_clip": 0.01166024, + "auxiliary_loss_mlp": 0.01026505, + "balance_loss_clip": 1.05060315, + "balance_loss_mlp": 1.0197432, + "epoch": 0.8216196717369085, + "flos": 25221787390080.0, + "grad_norm": 1.6731329061506115, + "language_loss": 0.8793807, + "learning_rate": 3.244571704040138e-07, + "loss": 0.90130591, + "num_input_tokens_seen": 147560965, + "step": 6833, + "time_per_iteration": 2.4621951580047607 + }, + { + "auxiliary_loss_clip": 0.01148958, + "auxiliary_loss_mlp": 0.01026551, + "balance_loss_clip": 1.04374933, + "balance_loss_mlp": 1.01836705, + "epoch": 0.8217399146275477, + "flos": 25371930240000.0, + "grad_norm": 2.232277605495166, + "language_loss": 0.73934895, + "learning_rate": 3.2403196253766374e-07, + "loss": 0.76110405, + "num_input_tokens_seen": 147580045, + "step": 6834, + "time_per_iteration": 2.4883499145507812 + }, + { + "auxiliary_loss_clip": 0.01148492, + "auxiliary_loss_mlp": 0.01025578, + "balance_loss_clip": 1.04524648, + "balance_loss_mlp": 1.01779974, + "epoch": 0.8218601575181868, + "flos": 25629625388160.0, + "grad_norm": 2.4671913219894672, + "language_loss": 0.78968859, + "learning_rate": 3.2360700891433254e-07, + "loss": 0.81142926, + "num_input_tokens_seen": 147599070, + "step": 6835, + "time_per_iteration": 2.4703404903411865 + }, + { + "auxiliary_loss_clip": 0.01023107, + "auxiliary_loss_mlp": 0.01002827, + "balance_loss_clip": 1.01077461, + "balance_loss_mlp": 1.0017724, + "epoch": 0.8219804004088258, + "flos": 67660229427840.0, + "grad_norm": 0.799394554818687, + "language_loss": 0.57317376, + "learning_rate": 3.231823095984847e-07, + "loss": 0.59343314, + "num_input_tokens_seen": 147653710, + "step": 6836, + "time_per_iteration": 3.033067226409912 + }, + { + "auxiliary_loss_clip": 0.01137238, + "auxiliary_loss_mlp": 0.01022, + "balance_loss_clip": 1.04527628, + "balance_loss_mlp": 1.01504469, + "epoch": 0.822100643299465, + "flos": 19464266327040.0, + "grad_norm": 2.012688134441855, + "language_loss": 0.76158744, + "learning_rate": 3.2275786465454814e-07, + "loss": 0.78317982, + "num_input_tokens_seen": 147670360, + "step": 6837, + "time_per_iteration": 2.4612369537353516 + }, + { + "auxiliary_loss_clip": 0.01120838, + "auxiliary_loss_mlp": 0.01023007, + "balance_loss_clip": 1.04266298, + "balance_loss_mlp": 1.01639128, + "epoch": 0.822220886190104, + "flos": 24681368292480.0, + "grad_norm": 2.137336169612773, + "language_loss": 0.75605828, + "learning_rate": 3.2233367414690917e-07, + "loss": 0.7774967, + "num_input_tokens_seen": 147692550, + "step": 6838, + "time_per_iteration": 2.548297643661499 + }, + { + "auxiliary_loss_clip": 0.01118648, + "auxiliary_loss_mlp": 0.01022626, + "balance_loss_clip": 1.04009938, + "balance_loss_mlp": 1.01574206, + "epoch": 0.8223411290807431, + "flos": 27819062991360.0, + "grad_norm": 2.6293256821619257, + "language_loss": 0.8494792, + "learning_rate": 3.219097381399183e-07, + "loss": 0.87089193, + "num_input_tokens_seen": 147709725, + "step": 6839, + "time_per_iteration": 2.5535924434661865 + }, + { + "auxiliary_loss_clip": 0.01143055, + "auxiliary_loss_mlp": 0.01024934, + "balance_loss_clip": 1.04465055, + "balance_loss_mlp": 1.01835024, + "epoch": 0.8224613719713821, + "flos": 23218546913280.0, + "grad_norm": 2.384893230331127, + "language_loss": 0.80890906, + "learning_rate": 3.2148605669788584e-07, + "loss": 0.83058888, + "num_input_tokens_seen": 147729615, + "step": 6840, + "time_per_iteration": 3.3277125358581543 + }, + { + "auxiliary_loss_clip": 0.01138229, + "auxiliary_loss_mlp": 0.0102417, + "balance_loss_clip": 1.04555702, + "balance_loss_mlp": 1.01680923, + "epoch": 0.8225816148620213, + "flos": 15706250726400.0, + "grad_norm": 2.6911656078117816, + "language_loss": 0.77285266, + "learning_rate": 3.2106262988508405e-07, + "loss": 0.79447669, + "num_input_tokens_seen": 147747665, + "step": 6841, + "time_per_iteration": 2.462916851043701 + }, + { + "auxiliary_loss_clip": 0.01139259, + "auxiliary_loss_mlp": 0.0102207, + "balance_loss_clip": 1.04456878, + "balance_loss_mlp": 1.01505494, + "epoch": 0.8227018577526604, + "flos": 18515111391360.0, + "grad_norm": 1.7893112431219729, + "language_loss": 0.74198681, + "learning_rate": 3.206394577657465e-07, + "loss": 0.76360011, + "num_input_tokens_seen": 147765445, + "step": 6842, + "time_per_iteration": 2.455817222595215 + }, + { + "auxiliary_loss_clip": 0.01156358, + "auxiliary_loss_mlp": 0.01028684, + "balance_loss_clip": 1.04799604, + "balance_loss_mlp": 1.02120948, + "epoch": 0.8228221006432994, + "flos": 22236785406720.0, + "grad_norm": 2.408334106447516, + "language_loss": 0.72702813, + "learning_rate": 3.202165404040675e-07, + "loss": 0.74887854, + "num_input_tokens_seen": 147783365, + "step": 6843, + "time_per_iteration": 3.2742409706115723 + }, + { + "auxiliary_loss_clip": 0.01097748, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.0427283, + "balance_loss_mlp": 1.02111745, + "epoch": 0.8229423435339386, + "flos": 24097532630400.0, + "grad_norm": 2.8885698183824884, + "language_loss": 0.74479699, + "learning_rate": 3.1979387786420396e-07, + "loss": 0.76606143, + "num_input_tokens_seen": 147803605, + "step": 6844, + "time_per_iteration": 2.601386308670044 + }, + { + "auxiliary_loss_clip": 0.01138487, + "auxiliary_loss_mlp": 0.01018616, + "balance_loss_clip": 1.04223073, + "balance_loss_mlp": 1.01184177, + "epoch": 0.8230625864245776, + "flos": 23878549365120.0, + "grad_norm": 1.7502135431720856, + "language_loss": 0.82327712, + "learning_rate": 3.1937147021027346e-07, + "loss": 0.84484816, + "num_input_tokens_seen": 147822060, + "step": 6845, + "time_per_iteration": 2.4975786209106445 + }, + { + "auxiliary_loss_clip": 0.01148717, + "auxiliary_loss_mlp": 0.01021213, + "balance_loss_clip": 1.04495025, + "balance_loss_mlp": 1.01484394, + "epoch": 0.8231828293152167, + "flos": 16581106379520.0, + "grad_norm": 2.4845316230889787, + "language_loss": 0.7667774, + "learning_rate": 3.189493175063547e-07, + "loss": 0.78847677, + "num_input_tokens_seen": 147839295, + "step": 6846, + "time_per_iteration": 2.4254555702209473 + }, + { + "auxiliary_loss_clip": 0.0113708, + "auxiliary_loss_mlp": 0.01024559, + "balance_loss_clip": 1.04483533, + "balance_loss_mlp": 1.01731062, + "epoch": 0.8233030722058559, + "flos": 18880071528960.0, + "grad_norm": 1.8202686101939696, + "language_loss": 0.67626613, + "learning_rate": 3.1852741981648776e-07, + "loss": 0.69788247, + "num_input_tokens_seen": 147857945, + "step": 6847, + "time_per_iteration": 2.458911418914795 + }, + { + "auxiliary_loss_clip": 0.01112208, + "auxiliary_loss_mlp": 0.01029079, + "balance_loss_clip": 1.04267943, + "balance_loss_mlp": 1.0218966, + "epoch": 0.8234233150964949, + "flos": 28439024757120.0, + "grad_norm": 2.729353710444371, + "language_loss": 0.69912225, + "learning_rate": 3.1810577720467404e-07, + "loss": 0.7205351, + "num_input_tokens_seen": 147879675, + "step": 6848, + "time_per_iteration": 3.3158977031707764 + }, + { + "auxiliary_loss_clip": 0.01138887, + "auxiliary_loss_mlp": 0.01021392, + "balance_loss_clip": 1.04517186, + "balance_loss_mlp": 1.01412606, + "epoch": 0.823543557987134, + "flos": 33765941577600.0, + "grad_norm": 10.5492874034828, + "language_loss": 0.56379616, + "learning_rate": 3.176843897348769e-07, + "loss": 0.58539897, + "num_input_tokens_seen": 147902870, + "step": 6849, + "time_per_iteration": 2.606759548187256 + }, + { + "auxiliary_loss_clip": 0.011339, + "auxiliary_loss_mlp": 0.01023834, + "balance_loss_clip": 1.04406989, + "balance_loss_mlp": 1.01666057, + "epoch": 0.8236638008777731, + "flos": 17092366611840.0, + "grad_norm": 2.457616355858711, + "language_loss": 0.75882953, + "learning_rate": 3.1726325747102034e-07, + "loss": 0.78040683, + "num_input_tokens_seen": 147921245, + "step": 6850, + "time_per_iteration": 2.4447011947631836 + }, + { + "auxiliary_loss_clip": 0.01100901, + "auxiliary_loss_mlp": 0.01024609, + "balance_loss_clip": 1.03642058, + "balance_loss_mlp": 1.01736712, + "epoch": 0.8237840437684122, + "flos": 61639982334720.0, + "grad_norm": 1.4723497310501072, + "language_loss": 0.64271373, + "learning_rate": 3.1684238047698974e-07, + "loss": 0.66396886, + "num_input_tokens_seen": 147949515, + "step": 6851, + "time_per_iteration": 2.912355661392212 + }, + { + "auxiliary_loss_clip": 0.01140571, + "auxiliary_loss_mlp": 0.01026097, + "balance_loss_clip": 1.04617202, + "balance_loss_mlp": 1.01889682, + "epoch": 0.8239042866590512, + "flos": 27309023821440.0, + "grad_norm": 2.1365551628339623, + "language_loss": 0.53172874, + "learning_rate": 3.1642175881663155e-07, + "loss": 0.55339539, + "num_input_tokens_seen": 147969245, + "step": 6852, + "time_per_iteration": 2.53395938873291 + }, + { + "auxiliary_loss_clip": 0.01162797, + "auxiliary_loss_mlp": 0.01021581, + "balance_loss_clip": 1.04615116, + "balance_loss_mlp": 1.01508379, + "epoch": 0.8240245295496904, + "flos": 21726351187200.0, + "grad_norm": 2.023049428612565, + "language_loss": 0.83929038, + "learning_rate": 3.160013925537537e-07, + "loss": 0.86113411, + "num_input_tokens_seen": 147990080, + "step": 6853, + "time_per_iteration": 2.416192054748535 + }, + { + "auxiliary_loss_clip": 0.01123604, + "auxiliary_loss_mlp": 0.01024113, + "balance_loss_clip": 1.04085815, + "balance_loss_mlp": 1.01683879, + "epoch": 0.8241447724403295, + "flos": 20009318279040.0, + "grad_norm": 7.111965539277004, + "language_loss": 0.75757122, + "learning_rate": 3.155812817521266e-07, + "loss": 0.77904838, + "num_input_tokens_seen": 148010455, + "step": 6854, + "time_per_iteration": 2.501901388168335 + }, + { + "auxiliary_loss_clip": 0.01138439, + "auxiliary_loss_mlp": 0.0102572, + "balance_loss_clip": 1.04483306, + "balance_loss_mlp": 1.01877356, + "epoch": 0.8242650153309685, + "flos": 22272983337600.0, + "grad_norm": 2.017624505195568, + "language_loss": 0.78218204, + "learning_rate": 3.151614264754787e-07, + "loss": 0.80382371, + "num_input_tokens_seen": 148028400, + "step": 6855, + "time_per_iteration": 2.4778809547424316 + }, + { + "auxiliary_loss_clip": 0.01164009, + "auxiliary_loss_mlp": 0.01024597, + "balance_loss_clip": 1.04513669, + "balance_loss_mlp": 1.01758194, + "epoch": 0.8243852582216077, + "flos": 22309971367680.0, + "grad_norm": 2.4675703312436514, + "language_loss": 0.79649591, + "learning_rate": 3.147418267875035e-07, + "loss": 0.81838197, + "num_input_tokens_seen": 148046530, + "step": 6856, + "time_per_iteration": 2.4160687923431396 + }, + { + "auxiliary_loss_clip": 0.01091824, + "auxiliary_loss_mlp": 0.00761465, + "balance_loss_clip": 1.03800154, + "balance_loss_mlp": 1.00042939, + "epoch": 0.8245055011122467, + "flos": 24645421756800.0, + "grad_norm": 2.167277393445466, + "language_loss": 0.65251005, + "learning_rate": 3.1432248275185315e-07, + "loss": 0.67104286, + "num_input_tokens_seen": 148067040, + "step": 6857, + "time_per_iteration": 2.6238596439361572 + }, + { + "auxiliary_loss_clip": 0.01150927, + "auxiliary_loss_mlp": 0.01023067, + "balance_loss_clip": 1.04731131, + "balance_loss_mlp": 1.01618898, + "epoch": 0.8246257440028858, + "flos": 17487275713920.0, + "grad_norm": 2.049562297499048, + "language_loss": 0.76828903, + "learning_rate": 3.139033944321412e-07, + "loss": 0.79002899, + "num_input_tokens_seen": 148084400, + "step": 6858, + "time_per_iteration": 2.4194962978363037 + }, + { + "auxiliary_loss_clip": 0.01152845, + "auxiliary_loss_mlp": 0.01021198, + "balance_loss_clip": 1.04540741, + "balance_loss_mlp": 1.01443911, + "epoch": 0.824745986893525, + "flos": 25010130499200.0, + "grad_norm": 1.639518671723465, + "language_loss": 0.79117405, + "learning_rate": 3.1348456189194507e-07, + "loss": 0.81291449, + "num_input_tokens_seen": 148104860, + "step": 6859, + "time_per_iteration": 2.4707069396972656 + }, + { + "auxiliary_loss_clip": 0.01112755, + "auxiliary_loss_mlp": 0.01021608, + "balance_loss_clip": 1.03932667, + "balance_loss_mlp": 1.01434231, + "epoch": 0.824866229784164, + "flos": 18772698798720.0, + "grad_norm": 1.7934963059501927, + "language_loss": 0.83034438, + "learning_rate": 3.1306598519479876e-07, + "loss": 0.85168803, + "num_input_tokens_seen": 148124680, + "step": 6860, + "time_per_iteration": 2.5230376720428467 + }, + { + "auxiliary_loss_clip": 0.01131201, + "auxiliary_loss_mlp": 0.0102251, + "balance_loss_clip": 1.04248166, + "balance_loss_mlp": 1.01595318, + "epoch": 0.8249864726748031, + "flos": 23842171866240.0, + "grad_norm": 3.4669129118230835, + "language_loss": 0.78268087, + "learning_rate": 3.1264766440420177e-07, + "loss": 0.80421793, + "num_input_tokens_seen": 148147150, + "step": 6861, + "time_per_iteration": 2.5132832527160645 + }, + { + "auxiliary_loss_clip": 0.01149082, + "auxiliary_loss_mlp": 0.01022166, + "balance_loss_clip": 1.04645252, + "balance_loss_mlp": 1.01529336, + "epoch": 0.8251067155654422, + "flos": 20303103617280.0, + "grad_norm": 2.0856019670899046, + "language_loss": 0.69264448, + "learning_rate": 3.122295995836124e-07, + "loss": 0.71435696, + "num_input_tokens_seen": 148167020, + "step": 6862, + "time_per_iteration": 2.4347610473632812 + }, + { + "auxiliary_loss_clip": 0.01154201, + "auxiliary_loss_mlp": 0.01021424, + "balance_loss_clip": 1.04321921, + "balance_loss_mlp": 1.01381302, + "epoch": 0.8252269584560813, + "flos": 25009699536000.0, + "grad_norm": 1.7768888128862097, + "language_loss": 0.77480811, + "learning_rate": 3.118117907964508e-07, + "loss": 0.7965644, + "num_input_tokens_seen": 148188965, + "step": 6863, + "time_per_iteration": 2.5595972537994385 + }, + { + "auxiliary_loss_clip": 0.01129612, + "auxiliary_loss_mlp": 0.01024955, + "balance_loss_clip": 1.04345632, + "balance_loss_mlp": 1.01807952, + "epoch": 0.8253472013467203, + "flos": 17128564542720.0, + "grad_norm": 1.8906899471746743, + "language_loss": 0.80131149, + "learning_rate": 3.1139423810609856e-07, + "loss": 0.82285714, + "num_input_tokens_seen": 148205660, + "step": 6864, + "time_per_iteration": 2.5707077980041504 + }, + { + "auxiliary_loss_clip": 0.01162816, + "auxiliary_loss_mlp": 0.01025638, + "balance_loss_clip": 1.04380798, + "balance_loss_mlp": 1.01819897, + "epoch": 0.8254674442373595, + "flos": 22414794232320.0, + "grad_norm": 1.8437331759961366, + "language_loss": 0.75427389, + "learning_rate": 3.1097694157589714e-07, + "loss": 0.77615833, + "num_input_tokens_seen": 148225545, + "step": 6865, + "time_per_iteration": 2.445305585861206 + }, + { + "auxiliary_loss_clip": 0.01148257, + "auxiliary_loss_mlp": 0.01027076, + "balance_loss_clip": 1.04637527, + "balance_loss_mlp": 1.01984286, + "epoch": 0.8255876871279986, + "flos": 24786765774720.0, + "grad_norm": 2.964809227991256, + "language_loss": 0.76846826, + "learning_rate": 3.105599012691511e-07, + "loss": 0.79022163, + "num_input_tokens_seen": 148243975, + "step": 6866, + "time_per_iteration": 2.467745780944824 + }, + { + "auxiliary_loss_clip": 0.01147489, + "auxiliary_loss_mlp": 0.01023943, + "balance_loss_clip": 1.04500008, + "balance_loss_mlp": 1.01710033, + "epoch": 0.8257079300186376, + "flos": 27455431656960.0, + "grad_norm": 1.4302936193903149, + "language_loss": 0.82487613, + "learning_rate": 3.101431172491249e-07, + "loss": 0.84659046, + "num_input_tokens_seen": 148265520, + "step": 6867, + "time_per_iteration": 3.3579022884368896 + }, + { + "auxiliary_loss_clip": 0.011261, + "auxiliary_loss_mlp": 0.00761875, + "balance_loss_clip": 1.04161596, + "balance_loss_mlp": 1.00042987, + "epoch": 0.8258281729092768, + "flos": 16471866142080.0, + "grad_norm": 2.126919550353325, + "language_loss": 0.71970677, + "learning_rate": 3.097265895790444e-07, + "loss": 0.73858654, + "num_input_tokens_seen": 148283730, + "step": 6868, + "time_per_iteration": 2.537630558013916 + }, + { + "auxiliary_loss_clip": 0.01123445, + "auxiliary_loss_mlp": 0.01021809, + "balance_loss_clip": 1.0421474, + "balance_loss_mlp": 1.01530576, + "epoch": 0.8259484157999158, + "flos": 21433822824960.0, + "grad_norm": 2.618214932771576, + "language_loss": 0.83413196, + "learning_rate": 3.093103183220962e-07, + "loss": 0.85558456, + "num_input_tokens_seen": 148303775, + "step": 6869, + "time_per_iteration": 3.3767173290252686 + }, + { + "auxiliary_loss_clip": 0.01053424, + "auxiliary_loss_mlp": 0.01000874, + "balance_loss_clip": 1.00961125, + "balance_loss_mlp": 0.99995059, + "epoch": 0.8260686586905549, + "flos": 58322342453760.0, + "grad_norm": 0.8200754910891466, + "language_loss": 0.5940001, + "learning_rate": 3.0889430354142796e-07, + "loss": 0.61454308, + "num_input_tokens_seen": 148365285, + "step": 6870, + "time_per_iteration": 3.014164924621582 + }, + { + "auxiliary_loss_clip": 0.01126464, + "auxiliary_loss_mlp": 0.01024728, + "balance_loss_clip": 1.04191077, + "balance_loss_mlp": 1.01742935, + "epoch": 0.826188901581194, + "flos": 27527288814720.0, + "grad_norm": 1.9459485876219063, + "language_loss": 0.69888842, + "learning_rate": 3.084785453001497e-07, + "loss": 0.72040039, + "num_input_tokens_seen": 148386200, + "step": 6871, + "time_per_iteration": 2.5698933601379395 + }, + { + "auxiliary_loss_clip": 0.01137556, + "auxiliary_loss_mlp": 0.00761863, + "balance_loss_clip": 1.04683053, + "balance_loss_mlp": 1.00045538, + "epoch": 0.8263091444718331, + "flos": 23696051339520.0, + "grad_norm": 2.13352818553947, + "language_loss": 0.82282352, + "learning_rate": 3.080630436613314e-07, + "loss": 0.84181774, + "num_input_tokens_seen": 148403970, + "step": 6872, + "time_per_iteration": 2.4855942726135254 + }, + { + "auxiliary_loss_clip": 0.0114296, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.04358542, + "balance_loss_mlp": 1.02146769, + "epoch": 0.8264293873624722, + "flos": 17165157523200.0, + "grad_norm": 2.547302559084706, + "language_loss": 0.86318392, + "learning_rate": 3.076477986880039e-07, + "loss": 0.88489974, + "num_input_tokens_seen": 148421765, + "step": 6873, + "time_per_iteration": 2.4098455905914307 + }, + { + "auxiliary_loss_clip": 0.01135802, + "auxiliary_loss_mlp": 0.01021283, + "balance_loss_clip": 1.04510796, + "balance_loss_mlp": 1.01429129, + "epoch": 0.8265496302531112, + "flos": 24098645952000.0, + "grad_norm": 2.3262202204578513, + "language_loss": 0.69370931, + "learning_rate": 3.0723281044315986e-07, + "loss": 0.71528018, + "num_input_tokens_seen": 148443720, + "step": 6874, + "time_per_iteration": 3.2196807861328125 + }, + { + "auxiliary_loss_clip": 0.01158421, + "auxiliary_loss_mlp": 0.01023775, + "balance_loss_clip": 1.04345608, + "balance_loss_mlp": 1.01721573, + "epoch": 0.8266698731437504, + "flos": 14099894599680.0, + "grad_norm": 2.0246735928259816, + "language_loss": 0.76180756, + "learning_rate": 3.068180789897521e-07, + "loss": 0.78362954, + "num_input_tokens_seen": 148462130, + "step": 6875, + "time_per_iteration": 2.3761699199676514 + }, + { + "auxiliary_loss_clip": 0.01154858, + "auxiliary_loss_mlp": 0.01022623, + "balance_loss_clip": 1.04526854, + "balance_loss_mlp": 1.01517797, + "epoch": 0.8267901160343895, + "flos": 30777563715840.0, + "grad_norm": 1.4361712172831118, + "language_loss": 0.81562686, + "learning_rate": 3.064036043906966e-07, + "loss": 0.83740163, + "num_input_tokens_seen": 148485570, + "step": 6876, + "time_per_iteration": 2.5402393341064453 + }, + { + "auxiliary_loss_clip": 0.01129314, + "auxiliary_loss_mlp": 0.01025423, + "balance_loss_clip": 1.04240394, + "balance_loss_mlp": 1.01782906, + "epoch": 0.8269103589250285, + "flos": 40624915242240.0, + "grad_norm": 2.376346023107187, + "language_loss": 0.68122613, + "learning_rate": 3.059893867088668e-07, + "loss": 0.70277345, + "num_input_tokens_seen": 148509715, + "step": 6877, + "time_per_iteration": 2.6738810539245605 + }, + { + "auxiliary_loss_clip": 0.01147215, + "auxiliary_loss_mlp": 0.01024467, + "balance_loss_clip": 1.04483867, + "balance_loss_mlp": 1.01801217, + "epoch": 0.8270306018156677, + "flos": 30263645877120.0, + "grad_norm": 2.17552021079681, + "language_loss": 0.66741371, + "learning_rate": 3.055754260071004e-07, + "loss": 0.68913054, + "num_input_tokens_seen": 148532010, + "step": 6878, + "time_per_iteration": 2.506004571914673 + }, + { + "auxiliary_loss_clip": 0.01150462, + "auxiliary_loss_mlp": 0.0102653, + "balance_loss_clip": 1.04599977, + "balance_loss_mlp": 1.01986313, + "epoch": 0.8271508447063067, + "flos": 25226599812480.0, + "grad_norm": 4.579153493549264, + "language_loss": 0.7360543, + "learning_rate": 3.051617223481948e-07, + "loss": 0.75782418, + "num_input_tokens_seen": 148553330, + "step": 6879, + "time_per_iteration": 2.4655911922454834 + }, + { + "auxiliary_loss_clip": 0.01134737, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.04430628, + "balance_loss_mlp": 1.02197146, + "epoch": 0.8272710875969458, + "flos": 17566602900480.0, + "grad_norm": 2.055779408794126, + "language_loss": 0.75496978, + "learning_rate": 3.047482757949078e-07, + "loss": 0.7766124, + "num_input_tokens_seen": 148570960, + "step": 6880, + "time_per_iteration": 2.485534429550171 + }, + { + "auxiliary_loss_clip": 0.01116602, + "auxiliary_loss_mlp": 0.00761293, + "balance_loss_clip": 1.03960729, + "balance_loss_mlp": 1.00044405, + "epoch": 0.827391330487585, + "flos": 19755465886080.0, + "grad_norm": 2.143221753535972, + "language_loss": 0.85831439, + "learning_rate": 3.043350864099605e-07, + "loss": 0.87709337, + "num_input_tokens_seen": 148589520, + "step": 6881, + "time_per_iteration": 2.491403102874756 + }, + { + "auxiliary_loss_clip": 0.01152137, + "auxiliary_loss_mlp": 0.01022256, + "balance_loss_clip": 1.04414189, + "balance_loss_mlp": 1.01538384, + "epoch": 0.827511573378224, + "flos": 16835174254080.0, + "grad_norm": 2.165174932197664, + "language_loss": 0.8078407, + "learning_rate": 3.039221542560315e-07, + "loss": 0.82958466, + "num_input_tokens_seen": 148606085, + "step": 6882, + "time_per_iteration": 2.398483991622925 + }, + { + "auxiliary_loss_clip": 0.0115022, + "auxiliary_loss_mlp": 0.01021935, + "balance_loss_clip": 1.04638052, + "balance_loss_mlp": 1.0149138, + "epoch": 0.8276318162688631, + "flos": 18369242259840.0, + "grad_norm": 2.210022440403164, + "language_loss": 0.73260939, + "learning_rate": 3.0350947939576356e-07, + "loss": 0.75433099, + "num_input_tokens_seen": 148625240, + "step": 6883, + "time_per_iteration": 2.4215855598449707 + }, + { + "auxiliary_loss_clip": 0.01156976, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.04678643, + "balance_loss_mlp": 1.01920927, + "epoch": 0.8277520591595022, + "flos": 19352691705600.0, + "grad_norm": 1.5453395721510128, + "language_loss": 0.72249901, + "learning_rate": 3.0309706189175876e-07, + "loss": 0.74433929, + "num_input_tokens_seen": 148645075, + "step": 6884, + "time_per_iteration": 2.438347339630127 + }, + { + "auxiliary_loss_clip": 0.01044694, + "auxiliary_loss_mlp": 0.01001846, + "balance_loss_clip": 1.0100615, + "balance_loss_mlp": 1.00084484, + "epoch": 0.8278723020501413, + "flos": 67918858329600.0, + "grad_norm": 0.7623672521857662, + "language_loss": 0.57401317, + "learning_rate": 3.0268490180658045e-07, + "loss": 0.59447861, + "num_input_tokens_seen": 148707855, + "step": 6885, + "time_per_iteration": 3.0473792552948 + }, + { + "auxiliary_loss_clip": 0.01169552, + "auxiliary_loss_mlp": 0.01022055, + "balance_loss_clip": 1.04985881, + "balance_loss_mlp": 1.01508081, + "epoch": 0.8279925449407803, + "flos": 18185738653440.0, + "grad_norm": 2.4562757559457586, + "language_loss": 0.7903136, + "learning_rate": 3.0227299920275305e-07, + "loss": 0.81222963, + "num_input_tokens_seen": 148724170, + "step": 6886, + "time_per_iteration": 2.370415210723877 + }, + { + "auxiliary_loss_clip": 0.01128666, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.04585683, + "balance_loss_mlp": 1.02101135, + "epoch": 0.8281127878314195, + "flos": 20631434860800.0, + "grad_norm": 1.8884400687405891, + "language_loss": 0.85513437, + "learning_rate": 3.018613541427613e-07, + "loss": 0.87670708, + "num_input_tokens_seen": 148743690, + "step": 6887, + "time_per_iteration": 2.5087547302246094 + }, + { + "auxiliary_loss_clip": 0.0116309, + "auxiliary_loss_mlp": 0.01024937, + "balance_loss_clip": 1.04531491, + "balance_loss_mlp": 1.01805305, + "epoch": 0.8282330307220586, + "flos": 18004282122240.0, + "grad_norm": 1.6714680723685145, + "language_loss": 0.73599643, + "learning_rate": 3.0144996668905243e-07, + "loss": 0.75787669, + "num_input_tokens_seen": 148761070, + "step": 6888, + "time_per_iteration": 2.4022560119628906 + }, + { + "auxiliary_loss_clip": 0.01097036, + "auxiliary_loss_mlp": 0.0076154, + "balance_loss_clip": 1.03731155, + "balance_loss_mlp": 1.00039434, + "epoch": 0.8283532736126976, + "flos": 20084120352000.0, + "grad_norm": 3.183612837149206, + "language_loss": 0.82138085, + "learning_rate": 3.010388369040331e-07, + "loss": 0.83996665, + "num_input_tokens_seen": 148779730, + "step": 6889, + "time_per_iteration": 2.60994291305542 + }, + { + "auxiliary_loss_clip": 0.01152733, + "auxiliary_loss_mlp": 0.01024846, + "balance_loss_clip": 1.04653656, + "balance_loss_mlp": 1.01799452, + "epoch": 0.8284735165033368, + "flos": 31868421805440.0, + "grad_norm": 1.6621977960126766, + "language_loss": 0.82844639, + "learning_rate": 3.0062796485007156e-07, + "loss": 0.85022223, + "num_input_tokens_seen": 148800670, + "step": 6890, + "time_per_iteration": 2.541858196258545 + }, + { + "auxiliary_loss_clip": 0.01164281, + "auxiliary_loss_mlp": 0.00761771, + "balance_loss_clip": 1.04595208, + "balance_loss_mlp": 1.00039065, + "epoch": 0.8285937593939758, + "flos": 26651319840000.0, + "grad_norm": 3.8447576595839563, + "language_loss": 0.65936232, + "learning_rate": 3.002173505894965e-07, + "loss": 0.67862284, + "num_input_tokens_seen": 148819820, + "step": 6891, + "time_per_iteration": 2.4771533012390137 + }, + { + "auxiliary_loss_clip": 0.01154038, + "auxiliary_loss_mlp": 0.01023016, + "balance_loss_clip": 1.04346776, + "balance_loss_mlp": 1.01531482, + "epoch": 0.8287140022846149, + "flos": 20193683811840.0, + "grad_norm": 9.186926026801611, + "language_loss": 0.62150168, + "learning_rate": 2.998069941845973e-07, + "loss": 0.64327222, + "num_input_tokens_seen": 148838890, + "step": 6892, + "time_per_iteration": 2.45940899848938 + }, + { + "auxiliary_loss_clip": 0.01062982, + "auxiliary_loss_mlp": 0.01001451, + "balance_loss_clip": 1.00901604, + "balance_loss_mlp": 1.00048542, + "epoch": 0.8288342451752541, + "flos": 70755980019840.0, + "grad_norm": 0.7093223707026493, + "language_loss": 0.57449889, + "learning_rate": 2.993968956976258e-07, + "loss": 0.5951432, + "num_input_tokens_seen": 148906635, + "step": 6893, + "time_per_iteration": 3.120269775390625 + }, + { + "auxiliary_loss_clip": 0.01170626, + "auxiliary_loss_mlp": 0.01023406, + "balance_loss_clip": 1.04820871, + "balance_loss_mlp": 1.01537752, + "epoch": 0.8289544880658931, + "flos": 24572235795840.0, + "grad_norm": 1.7797692418218232, + "language_loss": 0.70151687, + "learning_rate": 2.9898705519079313e-07, + "loss": 0.72345722, + "num_input_tokens_seen": 148925740, + "step": 6894, + "time_per_iteration": 3.3278324604034424 + }, + { + "auxiliary_loss_clip": 0.01130049, + "auxiliary_loss_mlp": 0.01021247, + "balance_loss_clip": 1.04206598, + "balance_loss_mlp": 1.01422596, + "epoch": 0.8290747309565322, + "flos": 22273378387200.0, + "grad_norm": 1.8173374357746075, + "language_loss": 0.74415332, + "learning_rate": 2.985774727262715e-07, + "loss": 0.76566625, + "num_input_tokens_seen": 148944585, + "step": 6895, + "time_per_iteration": 2.473827838897705 + }, + { + "auxiliary_loss_clip": 0.01163261, + "auxiliary_loss_mlp": 0.01023308, + "balance_loss_clip": 1.04610658, + "balance_loss_mlp": 1.01673973, + "epoch": 0.8291949738471713, + "flos": 23255570856960.0, + "grad_norm": 1.8623617111640176, + "language_loss": 0.81484956, + "learning_rate": 2.981681483661949e-07, + "loss": 0.83671522, + "num_input_tokens_seen": 148964170, + "step": 6896, + "time_per_iteration": 3.122509717941284 + }, + { + "auxiliary_loss_clip": 0.01152999, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.04818177, + "balance_loss_mlp": 1.02474427, + "epoch": 0.8293152167378104, + "flos": 52555768185600.0, + "grad_norm": 1.7380999878035768, + "language_loss": 0.71114653, + "learning_rate": 2.9775908217265633e-07, + "loss": 0.7329914, + "num_input_tokens_seen": 148989405, + "step": 6897, + "time_per_iteration": 3.413006544113159 + }, + { + "auxiliary_loss_clip": 0.01008657, + "auxiliary_loss_mlp": 0.01002649, + "balance_loss_clip": 1.00801134, + "balance_loss_mlp": 1.00151622, + "epoch": 0.8294354596284494, + "flos": 63356156294400.0, + "grad_norm": 0.8240186140067517, + "language_loss": 0.50400698, + "learning_rate": 2.9735027420771253e-07, + "loss": 0.52412009, + "num_input_tokens_seen": 149049740, + "step": 6898, + "time_per_iteration": 3.0846452713012695 + }, + { + "auxiliary_loss_clip": 0.01133749, + "auxiliary_loss_mlp": 0.01028298, + "balance_loss_clip": 1.04777813, + "balance_loss_mlp": 1.02184868, + "epoch": 0.8295557025190886, + "flos": 24827021942400.0, + "grad_norm": 1.7845273763816505, + "language_loss": 0.71218121, + "learning_rate": 2.969417245333774e-07, + "loss": 0.73380172, + "num_input_tokens_seen": 149069120, + "step": 6899, + "time_per_iteration": 2.501249074935913 + }, + { + "auxiliary_loss_clip": 0.01121774, + "auxiliary_loss_mlp": 0.0102371, + "balance_loss_clip": 1.04458678, + "balance_loss_mlp": 1.01688242, + "epoch": 0.8296759454097277, + "flos": 25118580637440.0, + "grad_norm": 2.0046030871539307, + "language_loss": 0.77855927, + "learning_rate": 2.9653343321162915e-07, + "loss": 0.80001408, + "num_input_tokens_seen": 149088630, + "step": 6900, + "time_per_iteration": 2.515753984451294 + }, + { + "auxiliary_loss_clip": 0.01125178, + "auxiliary_loss_mlp": 0.01021405, + "balance_loss_clip": 1.04566622, + "balance_loss_mlp": 1.01363289, + "epoch": 0.8297961883003667, + "flos": 24132581326080.0, + "grad_norm": 4.005801740255351, + "language_loss": 0.65104586, + "learning_rate": 2.9612540030440446e-07, + "loss": 0.67251164, + "num_input_tokens_seen": 149109175, + "step": 6901, + "time_per_iteration": 3.291018486022949 + }, + { + "auxiliary_loss_clip": 0.01042717, + "auxiliary_loss_mlp": 0.01001102, + "balance_loss_clip": 1.00893545, + "balance_loss_mlp": 1.00011218, + "epoch": 0.8299164311910058, + "flos": 67446561375360.0, + "grad_norm": 0.8973955168334014, + "language_loss": 0.64099193, + "learning_rate": 2.9571762587360206e-07, + "loss": 0.66143012, + "num_input_tokens_seen": 149165560, + "step": 6902, + "time_per_iteration": 3.005819320678711 + }, + { + "auxiliary_loss_clip": 0.01103939, + "auxiliary_loss_mlp": 0.01024169, + "balance_loss_clip": 1.03537178, + "balance_loss_mlp": 1.01749635, + "epoch": 0.8300366740816449, + "flos": 25228682801280.0, + "grad_norm": 1.796975921297616, + "language_loss": 0.74119967, + "learning_rate": 2.953101099810806e-07, + "loss": 0.76248074, + "num_input_tokens_seen": 149185165, + "step": 6903, + "time_per_iteration": 2.5984020233154297 + }, + { + "auxiliary_loss_clip": 0.01145991, + "auxiliary_loss_mlp": 0.01025332, + "balance_loss_clip": 1.046229, + "balance_loss_mlp": 1.01837027, + "epoch": 0.830156916972284, + "flos": 18041018757120.0, + "grad_norm": 2.4255223080633463, + "language_loss": 0.82811439, + "learning_rate": 2.9490285268865965e-07, + "loss": 0.84982765, + "num_input_tokens_seen": 149202655, + "step": 6904, + "time_per_iteration": 2.437838077545166 + }, + { + "auxiliary_loss_clip": 0.01156831, + "auxiliary_loss_mlp": 0.01019836, + "balance_loss_clip": 1.04906797, + "balance_loss_mlp": 1.01275253, + "epoch": 0.830277159862923, + "flos": 26322485806080.0, + "grad_norm": 2.039131686130138, + "language_loss": 0.79322606, + "learning_rate": 2.9449585405812085e-07, + "loss": 0.81499279, + "num_input_tokens_seen": 149220035, + "step": 6905, + "time_per_iteration": 2.4953598976135254 + }, + { + "auxiliary_loss_clip": 0.01129367, + "auxiliary_loss_mlp": 0.01019925, + "balance_loss_clip": 1.04520488, + "balance_loss_mlp": 1.01289141, + "epoch": 0.8303974027535622, + "flos": 19938861751680.0, + "grad_norm": 1.9171193094958274, + "language_loss": 0.73835111, + "learning_rate": 2.940891141512043e-07, + "loss": 0.75984395, + "num_input_tokens_seen": 149238055, + "step": 6906, + "time_per_iteration": 2.5078468322753906 + }, + { + "auxiliary_loss_clip": 0.01136621, + "auxiliary_loss_mlp": 0.01026169, + "balance_loss_clip": 1.04362547, + "balance_loss_mlp": 1.01843262, + "epoch": 0.8305176456442013, + "flos": 17165552572800.0, + "grad_norm": 3.06073748522985, + "language_loss": 0.71972716, + "learning_rate": 2.9368263302961385e-07, + "loss": 0.74135506, + "num_input_tokens_seen": 149256755, + "step": 6907, + "time_per_iteration": 2.4663076400756836 + }, + { + "auxiliary_loss_clip": 0.01096808, + "auxiliary_loss_mlp": 0.01019424, + "balance_loss_clip": 1.03937244, + "balance_loss_mlp": 1.01244736, + "epoch": 0.8306378885348403, + "flos": 25627614226560.0, + "grad_norm": 1.9844074649284424, + "language_loss": 0.80027926, + "learning_rate": 2.9327641075501075e-07, + "loss": 0.82144165, + "num_input_tokens_seen": 149275745, + "step": 6908, + "time_per_iteration": 2.6545732021331787 + }, + { + "auxiliary_loss_clip": 0.01129501, + "auxiliary_loss_mlp": 0.01030973, + "balance_loss_clip": 1.04019618, + "balance_loss_mlp": 1.0232538, + "epoch": 0.8307581314254795, + "flos": 33947864985600.0, + "grad_norm": 2.362089000669139, + "language_loss": 0.66148579, + "learning_rate": 2.9287044738901866e-07, + "loss": 0.68309051, + "num_input_tokens_seen": 149293730, + "step": 6909, + "time_per_iteration": 2.5797693729400635 + }, + { + "auxiliary_loss_clip": 0.01152017, + "auxiliary_loss_mlp": 0.00761515, + "balance_loss_clip": 1.04486823, + "balance_loss_mlp": 1.00038576, + "epoch": 0.8308783743161186, + "flos": 17562724231680.0, + "grad_norm": 1.860933637122064, + "language_loss": 0.90652752, + "learning_rate": 2.9246474299322274e-07, + "loss": 0.92566288, + "num_input_tokens_seen": 149309290, + "step": 6910, + "time_per_iteration": 2.4368836879730225 + }, + { + "auxiliary_loss_clip": 0.0102841, + "auxiliary_loss_mlp": 0.01002087, + "balance_loss_clip": 1.0083518, + "balance_loss_mlp": 1.00103843, + "epoch": 0.8309986172067576, + "flos": 69412885649280.0, + "grad_norm": 0.8961285096441381, + "language_loss": 0.63182837, + "learning_rate": 2.920592976291678e-07, + "loss": 0.65213335, + "num_input_tokens_seen": 149366620, + "step": 6911, + "time_per_iteration": 3.0481698513031006 + }, + { + "auxiliary_loss_clip": 0.01150029, + "auxiliary_loss_mlp": 0.01027453, + "balance_loss_clip": 1.04512262, + "balance_loss_mlp": 1.020208, + "epoch": 0.8311188600973968, + "flos": 22309755886080.0, + "grad_norm": 1.8766682038346312, + "language_loss": 0.80626452, + "learning_rate": 2.916541113583595e-07, + "loss": 0.82803929, + "num_input_tokens_seen": 149385120, + "step": 6912, + "time_per_iteration": 2.4464707374572754 + }, + { + "auxiliary_loss_clip": 0.01129246, + "auxiliary_loss_mlp": 0.01023349, + "balance_loss_clip": 1.04649973, + "balance_loss_mlp": 1.01607394, + "epoch": 0.8312391029880358, + "flos": 18770077105920.0, + "grad_norm": 2.5345308066752987, + "language_loss": 0.66431165, + "learning_rate": 2.912491842422642e-07, + "loss": 0.68583763, + "num_input_tokens_seen": 149402825, + "step": 6913, + "time_per_iteration": 2.5723748207092285 + }, + { + "auxiliary_loss_clip": 0.01152995, + "auxiliary_loss_mlp": 0.01023288, + "balance_loss_clip": 1.04673028, + "balance_loss_mlp": 1.01606369, + "epoch": 0.8313593458786749, + "flos": 20376648714240.0, + "grad_norm": 1.7016825346540787, + "language_loss": 0.70998251, + "learning_rate": 2.9084451634230857e-07, + "loss": 0.73174536, + "num_input_tokens_seen": 149422125, + "step": 6914, + "time_per_iteration": 2.456085205078125 + }, + { + "auxiliary_loss_clip": 0.01121059, + "auxiliary_loss_mlp": 0.01027056, + "balance_loss_clip": 1.04170632, + "balance_loss_mlp": 1.0198977, + "epoch": 0.831479588769314, + "flos": 32124069878400.0, + "grad_norm": 2.970555828884649, + "language_loss": 0.71587288, + "learning_rate": 2.9044010771988125e-07, + "loss": 0.73735398, + "num_input_tokens_seen": 149441940, + "step": 6915, + "time_per_iteration": 2.627363443374634 + }, + { + "auxiliary_loss_clip": 0.01130238, + "auxiliary_loss_mlp": 0.01025398, + "balance_loss_clip": 1.04355609, + "balance_loss_mlp": 1.01846337, + "epoch": 0.8315998316599531, + "flos": 45185929338240.0, + "grad_norm": 1.709565035641112, + "language_loss": 0.72039658, + "learning_rate": 2.900359584363303e-07, + "loss": 0.7419529, + "num_input_tokens_seen": 149465045, + "step": 6916, + "time_per_iteration": 2.6957600116729736 + }, + { + "auxiliary_loss_clip": 0.01106916, + "auxiliary_loss_mlp": 0.01031515, + "balance_loss_clip": 1.04366982, + "balance_loss_mlp": 1.02343297, + "epoch": 0.8317200745505922, + "flos": 18363747479040.0, + "grad_norm": 2.0739871062153847, + "language_loss": 0.84478104, + "learning_rate": 2.8963206855296494e-07, + "loss": 0.86616528, + "num_input_tokens_seen": 149481285, + "step": 6917, + "time_per_iteration": 2.498197078704834 + }, + { + "auxiliary_loss_clip": 0.01151923, + "auxiliary_loss_mlp": 0.01028665, + "balance_loss_clip": 1.04496694, + "balance_loss_mlp": 1.02136922, + "epoch": 0.8318403174412313, + "flos": 24206557386240.0, + "grad_norm": 2.164780494814873, + "language_loss": 0.7699967, + "learning_rate": 2.892284381310548e-07, + "loss": 0.79180259, + "num_input_tokens_seen": 149502700, + "step": 6918, + "time_per_iteration": 2.459580898284912 + }, + { + "auxiliary_loss_clip": 0.01131532, + "auxiliary_loss_mlp": 0.01025444, + "balance_loss_clip": 1.04203439, + "balance_loss_mlp": 1.01781154, + "epoch": 0.8319605603318704, + "flos": 22418780641920.0, + "grad_norm": 2.5551873929896742, + "language_loss": 0.72564185, + "learning_rate": 2.888250672318302e-07, + "loss": 0.74721158, + "num_input_tokens_seen": 149520100, + "step": 6919, + "time_per_iteration": 2.4520742893218994 + }, + { + "auxiliary_loss_clip": 0.01169981, + "auxiliary_loss_mlp": 0.0102842, + "balance_loss_clip": 1.05039489, + "balance_loss_mlp": 1.02141023, + "epoch": 0.8320808032225094, + "flos": 37414501459200.0, + "grad_norm": 1.7787985402648339, + "language_loss": 0.6867522, + "learning_rate": 2.884219559164831e-07, + "loss": 0.70873618, + "num_input_tokens_seen": 149543245, + "step": 6920, + "time_per_iteration": 2.5364813804626465 + }, + { + "auxiliary_loss_clip": 0.01150751, + "auxiliary_loss_mlp": 0.01022845, + "balance_loss_clip": 1.04656553, + "balance_loss_mlp": 1.01604974, + "epoch": 0.8322010461131486, + "flos": 12787395638400.0, + "grad_norm": 1.9305490846526931, + "language_loss": 0.81247079, + "learning_rate": 2.880191042461635e-07, + "loss": 0.83420676, + "num_input_tokens_seen": 149559185, + "step": 6921, + "time_per_iteration": 3.244818925857544 + }, + { + "auxiliary_loss_clip": 0.01113086, + "auxiliary_loss_mlp": 0.01022549, + "balance_loss_clip": 1.04098594, + "balance_loss_mlp": 1.01606703, + "epoch": 0.8323212890037877, + "flos": 15815455050240.0, + "grad_norm": 1.64007199583088, + "language_loss": 0.79980624, + "learning_rate": 2.876165122819849e-07, + "loss": 0.82116258, + "num_input_tokens_seen": 149577165, + "step": 6922, + "time_per_iteration": 2.516418933868408 + }, + { + "auxiliary_loss_clip": 0.01163232, + "auxiliary_loss_mlp": 0.0101988, + "balance_loss_clip": 1.04628515, + "balance_loss_mlp": 1.0132308, + "epoch": 0.8324415318944267, + "flos": 21719276208000.0, + "grad_norm": 1.7273990896534759, + "language_loss": 0.79559994, + "learning_rate": 2.872141800850201e-07, + "loss": 0.81743103, + "num_input_tokens_seen": 149594340, + "step": 6923, + "time_per_iteration": 3.223858118057251 + }, + { + "auxiliary_loss_clip": 0.01164221, + "auxiliary_loss_mlp": 0.01023564, + "balance_loss_clip": 1.04692876, + "balance_loss_mlp": 1.01726723, + "epoch": 0.8325617747850659, + "flos": 34198700636160.0, + "grad_norm": 2.0780732159364854, + "language_loss": 0.73479724, + "learning_rate": 2.868121077163024e-07, + "loss": 0.75667512, + "num_input_tokens_seen": 149613895, + "step": 6924, + "time_per_iteration": 3.296491861343384 + }, + { + "auxiliary_loss_clip": 0.01154852, + "auxiliary_loss_mlp": 0.01028769, + "balance_loss_clip": 1.0455606, + "balance_loss_mlp": 1.02177751, + "epoch": 0.8326820176757049, + "flos": 18369457741440.0, + "grad_norm": 1.7263277654853793, + "language_loss": 0.72476411, + "learning_rate": 2.864102952368257e-07, + "loss": 0.74660033, + "num_input_tokens_seen": 149631820, + "step": 6925, + "time_per_iteration": 2.4176251888275146 + }, + { + "auxiliary_loss_clip": 0.0109671, + "auxiliary_loss_mlp": 0.01025103, + "balance_loss_clip": 1.0358516, + "balance_loss_mlp": 1.01763415, + "epoch": 0.832802260566344, + "flos": 35991325716480.0, + "grad_norm": 1.32661437481317, + "language_loss": 0.59605002, + "learning_rate": 2.860087427075444e-07, + "loss": 0.61726815, + "num_input_tokens_seen": 149656070, + "step": 6926, + "time_per_iteration": 2.6803042888641357 + }, + { + "auxiliary_loss_clip": 0.01131479, + "auxiliary_loss_mlp": 0.01027585, + "balance_loss_clip": 1.0436573, + "balance_loss_mlp": 1.02074826, + "epoch": 0.8329225034569832, + "flos": 14244434928000.0, + "grad_norm": 2.473242672375076, + "language_loss": 0.86173701, + "learning_rate": 2.856074501893744e-07, + "loss": 0.8833276, + "num_input_tokens_seen": 149671270, + "step": 6927, + "time_per_iteration": 2.436058521270752 + }, + { + "auxiliary_loss_clip": 0.01155315, + "auxiliary_loss_mlp": 0.01023799, + "balance_loss_clip": 1.04841924, + "balance_loss_mlp": 1.01699221, + "epoch": 0.8330427463476222, + "flos": 18077468083200.0, + "grad_norm": 1.6385088483415946, + "language_loss": 0.81599665, + "learning_rate": 2.8520641774319054e-07, + "loss": 0.83778775, + "num_input_tokens_seen": 149689360, + "step": 6928, + "time_per_iteration": 3.199218511581421 + }, + { + "auxiliary_loss_clip": 0.01137855, + "auxiliary_loss_mlp": 0.01024733, + "balance_loss_clip": 1.04070127, + "balance_loss_mlp": 1.01763415, + "epoch": 0.8331629892382613, + "flos": 18040839189120.0, + "grad_norm": 2.275202050732897, + "language_loss": 0.75433743, + "learning_rate": 2.848056454298309e-07, + "loss": 0.77596325, + "num_input_tokens_seen": 149706685, + "step": 6929, + "time_per_iteration": 2.4604151248931885 + }, + { + "auxiliary_loss_clip": 0.0113524, + "auxiliary_loss_mlp": 0.0102128, + "balance_loss_clip": 1.04547763, + "balance_loss_mlp": 1.0141964, + "epoch": 0.8332832321289004, + "flos": 17457398576640.0, + "grad_norm": 1.8827614930271386, + "language_loss": 0.65574348, + "learning_rate": 2.844051333100905e-07, + "loss": 0.67730868, + "num_input_tokens_seen": 149724230, + "step": 6930, + "time_per_iteration": 2.449049711227417 + }, + { + "auxiliary_loss_clip": 0.01138384, + "auxiliary_loss_mlp": 0.01021326, + "balance_loss_clip": 1.04648757, + "balance_loss_mlp": 1.01484096, + "epoch": 0.8334034750195395, + "flos": 15084852416640.0, + "grad_norm": 1.9847170586925396, + "language_loss": 0.8394497, + "learning_rate": 2.840048814447269e-07, + "loss": 0.86104679, + "num_input_tokens_seen": 149742395, + "step": 6931, + "time_per_iteration": 2.453190565109253 + }, + { + "auxiliary_loss_clip": 0.01131392, + "auxiliary_loss_mlp": 0.01021846, + "balance_loss_clip": 1.04281545, + "balance_loss_mlp": 1.0149827, + "epoch": 0.8335237179101785, + "flos": 19427170556160.0, + "grad_norm": 2.315069979658486, + "language_loss": 0.739308, + "learning_rate": 2.836048898944587e-07, + "loss": 0.76084042, + "num_input_tokens_seen": 149760820, + "step": 6932, + "time_per_iteration": 2.454880952835083 + }, + { + "auxiliary_loss_clip": 0.0113615, + "auxiliary_loss_mlp": 0.01024557, + "balance_loss_clip": 1.04366779, + "balance_loss_mlp": 1.01809025, + "epoch": 0.8336439608008177, + "flos": 21762046327680.0, + "grad_norm": 2.6239200687743787, + "language_loss": 0.72031128, + "learning_rate": 2.832051587199642e-07, + "loss": 0.74191839, + "num_input_tokens_seen": 149778075, + "step": 6933, + "time_per_iteration": 2.458258867263794 + }, + { + "auxiliary_loss_clip": 0.01053453, + "auxiliary_loss_mlp": 0.01002175, + "balance_loss_clip": 1.00882685, + "balance_loss_mlp": 1.00120354, + "epoch": 0.8337642036914568, + "flos": 59702783990400.0, + "grad_norm": 0.8098193649508391, + "language_loss": 0.57789481, + "learning_rate": 2.828056879818821e-07, + "loss": 0.59845108, + "num_input_tokens_seen": 149837150, + "step": 6934, + "time_per_iteration": 2.9578049182891846 + }, + { + "auxiliary_loss_clip": 0.01121406, + "auxiliary_loss_mlp": 0.01024675, + "balance_loss_clip": 1.03947902, + "balance_loss_mlp": 1.01855052, + "epoch": 0.8338844465820958, + "flos": 27162185022720.0, + "grad_norm": 1.729355543817932, + "language_loss": 0.83444381, + "learning_rate": 2.824064777408117e-07, + "loss": 0.85590464, + "num_input_tokens_seen": 149856940, + "step": 6935, + "time_per_iteration": 2.5528767108917236 + }, + { + "auxiliary_loss_clip": 0.01150406, + "auxiliary_loss_mlp": 0.01024755, + "balance_loss_clip": 1.04656959, + "balance_loss_mlp": 1.01760292, + "epoch": 0.8340046894727349, + "flos": 30481264425600.0, + "grad_norm": 1.8801422958469023, + "language_loss": 0.75670588, + "learning_rate": 2.8200752805731263e-07, + "loss": 0.77845752, + "num_input_tokens_seen": 149879930, + "step": 6936, + "time_per_iteration": 2.507885456085205 + }, + { + "auxiliary_loss_clip": 0.01149698, + "auxiliary_loss_mlp": 0.01023174, + "balance_loss_clip": 1.04594362, + "balance_loss_mlp": 1.01621246, + "epoch": 0.834124932363374, + "flos": 27126166659840.0, + "grad_norm": 1.6709066521178355, + "language_loss": 0.81128395, + "learning_rate": 2.8160883899190625e-07, + "loss": 0.8330127, + "num_input_tokens_seen": 149903200, + "step": 6937, + "time_per_iteration": 2.4954211711883545 + }, + { + "auxiliary_loss_clip": 0.01116148, + "auxiliary_loss_mlp": 0.01024616, + "balance_loss_clip": 1.04419136, + "balance_loss_mlp": 1.0178299, + "epoch": 0.8342451752540131, + "flos": 24569865498240.0, + "grad_norm": 2.1986581566204837, + "language_loss": 0.72984147, + "learning_rate": 2.8121041060507234e-07, + "loss": 0.75124907, + "num_input_tokens_seen": 149922230, + "step": 6938, + "time_per_iteration": 2.5200958251953125 + }, + { + "auxiliary_loss_clip": 0.01155553, + "auxiliary_loss_mlp": 0.01021479, + "balance_loss_clip": 1.0454793, + "balance_loss_mlp": 1.01440072, + "epoch": 0.8343654181446521, + "flos": 26615085995520.0, + "grad_norm": 1.5648783740553736, + "language_loss": 0.71559078, + "learning_rate": 2.808122429572528e-07, + "loss": 0.73736107, + "num_input_tokens_seen": 149942435, + "step": 6939, + "time_per_iteration": 2.477778911590576 + }, + { + "auxiliary_loss_clip": 0.01130513, + "auxiliary_loss_mlp": 0.01025887, + "balance_loss_clip": 1.04283655, + "balance_loss_mlp": 1.0185318, + "epoch": 0.8344856610352913, + "flos": 20777268078720.0, + "grad_norm": 2.8486585117066476, + "language_loss": 0.76067793, + "learning_rate": 2.804143361088489e-07, + "loss": 0.78224194, + "num_input_tokens_seen": 149961615, + "step": 6940, + "time_per_iteration": 2.499452829360962 + }, + { + "auxiliary_loss_clip": 0.0113024, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.04214132, + "balance_loss_mlp": 1.0218333, + "epoch": 0.8346059039259304, + "flos": 26095960684800.0, + "grad_norm": 2.387089435526413, + "language_loss": 0.78118169, + "learning_rate": 2.8001669012022277e-07, + "loss": 0.80277801, + "num_input_tokens_seen": 149979585, + "step": 6941, + "time_per_iteration": 2.493180274963379 + }, + { + "auxiliary_loss_clip": 0.01151673, + "auxiliary_loss_mlp": 0.01026672, + "balance_loss_clip": 1.04898906, + "balance_loss_mlp": 1.01939154, + "epoch": 0.8347261468165694, + "flos": 29027708755200.0, + "grad_norm": 1.7584457250076992, + "language_loss": 0.69360065, + "learning_rate": 2.7961930505169795e-07, + "loss": 0.71538413, + "num_input_tokens_seen": 150003830, + "step": 6942, + "time_per_iteration": 2.5236563682556152 + }, + { + "auxiliary_loss_clip": 0.01153725, + "auxiliary_loss_mlp": 0.00761738, + "balance_loss_clip": 1.04787278, + "balance_loss_mlp": 1.00048041, + "epoch": 0.8348463897072086, + "flos": 26396461866240.0, + "grad_norm": 2.015232040819183, + "language_loss": 0.76177752, + "learning_rate": 2.792221809635558e-07, + "loss": 0.78093219, + "num_input_tokens_seen": 150024460, + "step": 6943, + "time_per_iteration": 2.494431257247925 + }, + { + "auxiliary_loss_clip": 0.01086584, + "auxiliary_loss_mlp": 0.01023602, + "balance_loss_clip": 1.04176188, + "balance_loss_mlp": 1.01638675, + "epoch": 0.8349666325978476, + "flos": 23367720096000.0, + "grad_norm": 2.1210041043345345, + "language_loss": 0.7506578, + "learning_rate": 2.788253179160411e-07, + "loss": 0.77175963, + "num_input_tokens_seen": 150045620, + "step": 6944, + "time_per_iteration": 2.7207767963409424 + }, + { + "auxiliary_loss_clip": 0.01136251, + "auxiliary_loss_mlp": 0.0102763, + "balance_loss_clip": 1.04426312, + "balance_loss_mlp": 1.02097559, + "epoch": 0.8350868754884867, + "flos": 12896528135040.0, + "grad_norm": 1.8673273006479607, + "language_loss": 0.64921761, + "learning_rate": 2.7842871596935725e-07, + "loss": 0.67085636, + "num_input_tokens_seen": 150064135, + "step": 6945, + "time_per_iteration": 2.9518702030181885 + }, + { + "auxiliary_loss_clip": 0.01154998, + "auxiliary_loss_mlp": 0.0101947, + "balance_loss_clip": 1.04560351, + "balance_loss_mlp": 1.01249576, + "epoch": 0.8352071183791259, + "flos": 26505522535680.0, + "grad_norm": 1.6611386414730471, + "language_loss": 0.68976617, + "learning_rate": 2.780323751836682e-07, + "loss": 0.71151078, + "num_input_tokens_seen": 150085350, + "step": 6946, + "time_per_iteration": 2.495692729949951 + }, + { + "auxiliary_loss_clip": 0.01134376, + "auxiliary_loss_mlp": 0.00761209, + "balance_loss_clip": 1.04079604, + "balance_loss_mlp": 1.00044143, + "epoch": 0.8353273612697649, + "flos": 20668063754880.0, + "grad_norm": 1.4186230174993504, + "language_loss": 0.78450453, + "learning_rate": 2.7763629561909876e-07, + "loss": 0.80346036, + "num_input_tokens_seen": 150106180, + "step": 6947, + "time_per_iteration": 2.4927756786346436 + }, + { + "auxiliary_loss_clip": 0.01161122, + "auxiliary_loss_mlp": 0.01020948, + "balance_loss_clip": 1.04444301, + "balance_loss_mlp": 1.01373017, + "epoch": 0.835447604160404, + "flos": 19754137082880.0, + "grad_norm": 1.901840883099076, + "language_loss": 0.76970482, + "learning_rate": 2.772404773357335e-07, + "loss": 0.79152548, + "num_input_tokens_seen": 150125585, + "step": 6948, + "time_per_iteration": 3.2674615383148193 + }, + { + "auxiliary_loss_clip": 0.01117828, + "auxiliary_loss_mlp": 0.01024899, + "balance_loss_clip": 1.04270077, + "balance_loss_mlp": 1.01764548, + "epoch": 0.8355678470510431, + "flos": 23435842239360.0, + "grad_norm": 1.755933792338582, + "language_loss": 0.78216481, + "learning_rate": 2.7684492039361853e-07, + "loss": 0.80359209, + "num_input_tokens_seen": 150144810, + "step": 6949, + "time_per_iteration": 3.301015853881836 + }, + { + "auxiliary_loss_clip": 0.01166449, + "auxiliary_loss_mlp": 0.01024814, + "balance_loss_clip": 1.04882836, + "balance_loss_mlp": 1.01816261, + "epoch": 0.8356880899416822, + "flos": 21214588164480.0, + "grad_norm": 2.087516574864285, + "language_loss": 0.83766705, + "learning_rate": 2.764496248527586e-07, + "loss": 0.85957968, + "num_input_tokens_seen": 150163785, + "step": 6950, + "time_per_iteration": 3.195256471633911 + }, + { + "auxiliary_loss_clip": 0.0113307, + "auxiliary_loss_mlp": 0.01024228, + "balance_loss_clip": 1.0437808, + "balance_loss_mlp": 1.0173614, + "epoch": 0.8358083328323213, + "flos": 28037543466240.0, + "grad_norm": 2.3510273278985987, + "language_loss": 0.78784251, + "learning_rate": 2.760545907731211e-07, + "loss": 0.80941546, + "num_input_tokens_seen": 150184360, + "step": 6951, + "time_per_iteration": 2.5752291679382324 + }, + { + "auxiliary_loss_clip": 0.01151516, + "auxiliary_loss_mlp": 0.0102019, + "balance_loss_clip": 1.04373634, + "balance_loss_mlp": 1.01315629, + "epoch": 0.8359285757229604, + "flos": 27783655159680.0, + "grad_norm": 1.70605508551502, + "language_loss": 0.6787008, + "learning_rate": 2.75659818214631e-07, + "loss": 0.70041788, + "num_input_tokens_seen": 150205465, + "step": 6952, + "time_per_iteration": 2.4936938285827637 + }, + { + "auxiliary_loss_clip": 0.01138847, + "auxiliary_loss_mlp": 0.01021823, + "balance_loss_clip": 1.04327345, + "balance_loss_mlp": 1.0147959, + "epoch": 0.8360488186135995, + "flos": 21435115714560.0, + "grad_norm": 2.805824680037939, + "language_loss": 0.78170407, + "learning_rate": 2.752653072371749e-07, + "loss": 0.80331075, + "num_input_tokens_seen": 150224900, + "step": 6953, + "time_per_iteration": 2.4774389266967773 + }, + { + "auxiliary_loss_clip": 0.0111829, + "auxiliary_loss_mlp": 0.01025847, + "balance_loss_clip": 1.04366326, + "balance_loss_mlp": 1.01947212, + "epoch": 0.8361690615042385, + "flos": 27632327160960.0, + "grad_norm": 1.7937690686683287, + "language_loss": 0.7484467, + "learning_rate": 2.7487105790060105e-07, + "loss": 0.7698881, + "num_input_tokens_seen": 150244310, + "step": 6954, + "time_per_iteration": 2.5428404808044434 + }, + { + "auxiliary_loss_clip": 0.01152706, + "auxiliary_loss_mlp": 0.01024512, + "balance_loss_clip": 1.04540277, + "balance_loss_mlp": 1.01797664, + "epoch": 0.8362893043948777, + "flos": 39202529598720.0, + "grad_norm": 1.9532298233738328, + "language_loss": 0.69287205, + "learning_rate": 2.7447707026471587e-07, + "loss": 0.71464431, + "num_input_tokens_seen": 150267285, + "step": 6955, + "time_per_iteration": 3.3278300762176514 + }, + { + "auxiliary_loss_clip": 0.01122584, + "auxiliary_loss_mlp": 0.0102251, + "balance_loss_clip": 1.04156768, + "balance_loss_mlp": 1.01582241, + "epoch": 0.8364095472855168, + "flos": 24785329230720.0, + "grad_norm": 1.9375372219850042, + "language_loss": 0.79851127, + "learning_rate": 2.740833443892874e-07, + "loss": 0.81996214, + "num_input_tokens_seen": 150285455, + "step": 6956, + "time_per_iteration": 2.5371198654174805 + }, + { + "auxiliary_loss_clip": 0.01135907, + "auxiliary_loss_mlp": 0.01021642, + "balance_loss_clip": 1.04385018, + "balance_loss_mlp": 1.01459134, + "epoch": 0.8365297901761558, + "flos": 22743412784640.0, + "grad_norm": 1.9392602941511379, + "language_loss": 0.79682028, + "learning_rate": 2.7368988033404327e-07, + "loss": 0.81839573, + "num_input_tokens_seen": 150302970, + "step": 6957, + "time_per_iteration": 2.468261957168579 + }, + { + "auxiliary_loss_clip": 0.01125615, + "auxiliary_loss_mlp": 0.01024318, + "balance_loss_clip": 1.04395747, + "balance_loss_mlp": 1.01799953, + "epoch": 0.836650033066795, + "flos": 28396003242240.0, + "grad_norm": 1.5531683819212478, + "language_loss": 0.84720874, + "learning_rate": 2.732966781586712e-07, + "loss": 0.86870801, + "num_input_tokens_seen": 150322715, + "step": 6958, + "time_per_iteration": 2.5641021728515625 + }, + { + "auxiliary_loss_clip": 0.01144791, + "auxiliary_loss_mlp": 0.0102156, + "balance_loss_clip": 1.04266, + "balance_loss_mlp": 1.01514602, + "epoch": 0.836770275957434, + "flos": 22236857233920.0, + "grad_norm": 1.7216225154039906, + "language_loss": 0.66908062, + "learning_rate": 2.729037379228205e-07, + "loss": 0.69074416, + "num_input_tokens_seen": 150342900, + "step": 6959, + "time_per_iteration": 2.4576852321624756 + }, + { + "auxiliary_loss_clip": 0.01137675, + "auxiliary_loss_mlp": 0.01026606, + "balance_loss_clip": 1.04780185, + "balance_loss_mlp": 1.01967692, + "epoch": 0.8368905188480731, + "flos": 22491930689280.0, + "grad_norm": 1.4476338864757565, + "language_loss": 0.8069762, + "learning_rate": 2.725110596860998e-07, + "loss": 0.828619, + "num_input_tokens_seen": 150363580, + "step": 6960, + "time_per_iteration": 2.4842004776000977 + }, + { + "auxiliary_loss_clip": 0.01104386, + "auxiliary_loss_mlp": 0.01022301, + "balance_loss_clip": 1.04132843, + "balance_loss_mlp": 1.01566148, + "epoch": 0.8370107617387123, + "flos": 13370405287680.0, + "grad_norm": 2.1321467620516, + "language_loss": 0.70102775, + "learning_rate": 2.7211864350807776e-07, + "loss": 0.72229463, + "num_input_tokens_seen": 150381780, + "step": 6961, + "time_per_iteration": 2.521544933319092 + }, + { + "auxiliary_loss_clip": 0.01163386, + "auxiliary_loss_mlp": 0.01026392, + "balance_loss_clip": 1.04560983, + "balance_loss_mlp": 1.01908493, + "epoch": 0.8371310046293513, + "flos": 25261289372160.0, + "grad_norm": 1.703790546895152, + "language_loss": 0.74247926, + "learning_rate": 2.717264894482836e-07, + "loss": 0.76437712, + "num_input_tokens_seen": 150402120, + "step": 6962, + "time_per_iteration": 2.464632749557495 + }, + { + "auxiliary_loss_clip": 0.01154373, + "auxiliary_loss_mlp": 0.01023766, + "balance_loss_clip": 1.04802001, + "balance_loss_mlp": 1.01622856, + "epoch": 0.8372512475199904, + "flos": 19792705311360.0, + "grad_norm": 2.007787186534202, + "language_loss": 0.81070566, + "learning_rate": 2.7133459756620646e-07, + "loss": 0.83248711, + "num_input_tokens_seen": 150419315, + "step": 6963, + "time_per_iteration": 2.468907594680786 + }, + { + "auxiliary_loss_clip": 0.01145278, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.0443536, + "balance_loss_mlp": 1.02338147, + "epoch": 0.8373714904106295, + "flos": 19391224020480.0, + "grad_norm": 1.7195082167261235, + "language_loss": 0.73618448, + "learning_rate": 2.7094296792129733e-07, + "loss": 0.75793993, + "num_input_tokens_seen": 150438915, + "step": 6964, + "time_per_iteration": 2.430236577987671 + }, + { + "auxiliary_loss_clip": 0.01150345, + "auxiliary_loss_mlp": 0.01022109, + "balance_loss_clip": 1.04481459, + "balance_loss_mlp": 1.01544523, + "epoch": 0.8374917333012686, + "flos": 14975935401600.0, + "grad_norm": 1.7689790264549623, + "language_loss": 0.75638026, + "learning_rate": 2.7055160057296424e-07, + "loss": 0.77810478, + "num_input_tokens_seen": 150456155, + "step": 6965, + "time_per_iteration": 2.5920419692993164 + }, + { + "auxiliary_loss_clip": 0.01121272, + "auxiliary_loss_mlp": 0.01023584, + "balance_loss_clip": 1.04243863, + "balance_loss_mlp": 1.01647019, + "epoch": 0.8376119761919076, + "flos": 30331839847680.0, + "grad_norm": 1.5715509710924365, + "language_loss": 0.72631294, + "learning_rate": 2.7016049558057896e-07, + "loss": 0.74776149, + "num_input_tokens_seen": 150478115, + "step": 6966, + "time_per_iteration": 2.5752642154693604 + }, + { + "auxiliary_loss_clip": 0.01149372, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.04579222, + "balance_loss_mlp": 1.02147007, + "epoch": 0.8377322190825467, + "flos": 29423336129280.0, + "grad_norm": 1.6801702603297883, + "language_loss": 0.70689237, + "learning_rate": 2.6976965300347074e-07, + "loss": 0.72867548, + "num_input_tokens_seen": 150500725, + "step": 6967, + "time_per_iteration": 2.5245361328125 + }, + { + "auxiliary_loss_clip": 0.01132521, + "auxiliary_loss_mlp": 0.01019616, + "balance_loss_clip": 1.04240608, + "balance_loss_mlp": 1.01227856, + "epoch": 0.8378524619731859, + "flos": 26687086807680.0, + "grad_norm": 2.9911795981002314, + "language_loss": 0.69328952, + "learning_rate": 2.693790729009309e-07, + "loss": 0.71481085, + "num_input_tokens_seen": 150522335, + "step": 6968, + "time_per_iteration": 2.5130228996276855 + }, + { + "auxiliary_loss_clip": 0.01135986, + "auxiliary_loss_mlp": 0.01022417, + "balance_loss_clip": 1.04358733, + "balance_loss_mlp": 1.01564646, + "epoch": 0.8379727048638249, + "flos": 20703866636160.0, + "grad_norm": 1.808333112762289, + "language_loss": 0.8842935, + "learning_rate": 2.6898875533220946e-07, + "loss": 0.90587747, + "num_input_tokens_seen": 150541640, + "step": 6969, + "time_per_iteration": 2.480952739715576 + }, + { + "auxiliary_loss_clip": 0.01160835, + "auxiliary_loss_mlp": 0.01021923, + "balance_loss_clip": 1.04758787, + "balance_loss_mlp": 1.01572394, + "epoch": 0.838092947754464, + "flos": 20084084438400.0, + "grad_norm": 1.9256385222617913, + "language_loss": 0.81550467, + "learning_rate": 2.685987003565171e-07, + "loss": 0.83733219, + "num_input_tokens_seen": 150559680, + "step": 6970, + "time_per_iteration": 2.430691957473755 + }, + { + "auxiliary_loss_clip": 0.0112071, + "auxiliary_loss_mlp": 0.01023381, + "balance_loss_clip": 1.04733825, + "balance_loss_mlp": 1.01575792, + "epoch": 0.8382131906451031, + "flos": 18113270964480.0, + "grad_norm": 3.0809953122499074, + "language_loss": 0.74431747, + "learning_rate": 2.6820890803302566e-07, + "loss": 0.7657584, + "num_input_tokens_seen": 150575205, + "step": 6971, + "time_per_iteration": 2.490973711013794 + }, + { + "auxiliary_loss_clip": 0.01137266, + "auxiliary_loss_mlp": 0.01021454, + "balance_loss_clip": 1.04685569, + "balance_loss_mlp": 1.01451862, + "epoch": 0.8383334335357422, + "flos": 17092653920640.0, + "grad_norm": 2.0855543562827252, + "language_loss": 0.81963855, + "learning_rate": 2.6781937842086557e-07, + "loss": 0.84122574, + "num_input_tokens_seen": 150593995, + "step": 6972, + "time_per_iteration": 2.4419753551483154 + }, + { + "auxiliary_loss_clip": 0.01152122, + "auxiliary_loss_mlp": 0.0102735, + "balance_loss_clip": 1.0447402, + "balance_loss_mlp": 1.02017021, + "epoch": 0.8384536764263812, + "flos": 20704728562560.0, + "grad_norm": 2.1998285428165913, + "language_loss": 0.67823023, + "learning_rate": 2.6743011157912933e-07, + "loss": 0.70002496, + "num_input_tokens_seen": 150613715, + "step": 6973, + "time_per_iteration": 2.4497125148773193 + }, + { + "auxiliary_loss_clip": 0.01106161, + "auxiliary_loss_mlp": 0.01023144, + "balance_loss_clip": 1.03709149, + "balance_loss_mlp": 1.01575351, + "epoch": 0.8385739193170204, + "flos": 28986842056320.0, + "grad_norm": 1.7955662608864091, + "language_loss": 0.65369266, + "learning_rate": 2.6704110756686725e-07, + "loss": 0.67498571, + "num_input_tokens_seen": 150634540, + "step": 6974, + "time_per_iteration": 2.6011180877685547 + }, + { + "auxiliary_loss_clip": 0.01133066, + "auxiliary_loss_mlp": 0.00761537, + "balance_loss_clip": 1.04210615, + "balance_loss_mlp": 1.00039816, + "epoch": 0.8386941622076595, + "flos": 23438068882560.0, + "grad_norm": 1.6424947887458081, + "language_loss": 0.83731735, + "learning_rate": 2.6665236644309085e-07, + "loss": 0.8562634, + "num_input_tokens_seen": 150654850, + "step": 6975, + "time_per_iteration": 3.338249444961548 + }, + { + "auxiliary_loss_clip": 0.01149939, + "auxiliary_loss_mlp": 0.01022066, + "balance_loss_clip": 1.04336858, + "balance_loss_mlp": 1.01553047, + "epoch": 0.8388144050982985, + "flos": 23002724044800.0, + "grad_norm": 1.8162602908230185, + "language_loss": 0.79731917, + "learning_rate": 2.662638882667727e-07, + "loss": 0.81903923, + "num_input_tokens_seen": 150673790, + "step": 6976, + "time_per_iteration": 4.033307790756226 + }, + { + "auxiliary_loss_clip": 0.01167135, + "auxiliary_loss_mlp": 0.01025097, + "balance_loss_clip": 1.0469172, + "balance_loss_mlp": 1.0177809, + "epoch": 0.8389346479889377, + "flos": 24280353878400.0, + "grad_norm": 2.0341288116991194, + "language_loss": 0.72796947, + "learning_rate": 2.658756730968443e-07, + "loss": 0.74989176, + "num_input_tokens_seen": 150692255, + "step": 6977, + "time_per_iteration": 2.4284560680389404 + }, + { + "auxiliary_loss_clip": 0.01142641, + "auxiliary_loss_mlp": 0.01021912, + "balance_loss_clip": 1.048944, + "balance_loss_mlp": 1.01503658, + "epoch": 0.8390548908795767, + "flos": 21215019127680.0, + "grad_norm": 2.2504662227208594, + "language_loss": 0.88496709, + "learning_rate": 2.654877209921975e-07, + "loss": 0.90661263, + "num_input_tokens_seen": 150709790, + "step": 6978, + "time_per_iteration": 2.4695749282836914 + }, + { + "auxiliary_loss_clip": 0.01114414, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.04012942, + "balance_loss_mlp": 1.01961565, + "epoch": 0.8391751337702158, + "flos": 35627299332480.0, + "grad_norm": 4.455165390242117, + "language_loss": 0.63079774, + "learning_rate": 2.651000320116843e-07, + "loss": 0.65222108, + "num_input_tokens_seen": 150730675, + "step": 6979, + "time_per_iteration": 2.661766290664673 + }, + { + "auxiliary_loss_clip": 0.01118862, + "auxiliary_loss_mlp": 0.00762102, + "balance_loss_clip": 1.04148579, + "balance_loss_mlp": 1.00054991, + "epoch": 0.839295376660855, + "flos": 21325229032320.0, + "grad_norm": 2.0538298703010174, + "language_loss": 0.7569896, + "learning_rate": 2.647126062141163e-07, + "loss": 0.77579921, + "num_input_tokens_seen": 150749750, + "step": 6980, + "time_per_iteration": 2.5189621448516846 + }, + { + "auxiliary_loss_clip": 0.01139241, + "auxiliary_loss_mlp": 0.0102409, + "balance_loss_clip": 1.04163551, + "balance_loss_mlp": 1.01731288, + "epoch": 0.839415619551494, + "flos": 18442535961600.0, + "grad_norm": 1.668233801528303, + "language_loss": 0.83698547, + "learning_rate": 2.643254436582669e-07, + "loss": 0.85861874, + "num_input_tokens_seen": 150769240, + "step": 6981, + "time_per_iteration": 2.4516139030456543 + }, + { + "auxiliary_loss_clip": 0.01109935, + "auxiliary_loss_mlp": 0.01022436, + "balance_loss_clip": 1.04181814, + "balance_loss_mlp": 1.01553988, + "epoch": 0.8395358624421331, + "flos": 23221958705280.0, + "grad_norm": 1.783339782362761, + "language_loss": 0.82419789, + "learning_rate": 2.6393854440286743e-07, + "loss": 0.84552157, + "num_input_tokens_seen": 150788410, + "step": 6982, + "time_per_iteration": 3.3319625854492188 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01023904, + "balance_loss_clip": 1.04849601, + "balance_loss_mlp": 1.0171212, + "epoch": 0.8396561053327722, + "flos": 24381657210240.0, + "grad_norm": 2.569599960357759, + "language_loss": 0.70636833, + "learning_rate": 2.6355190850661045e-07, + "loss": 0.72825789, + "num_input_tokens_seen": 150805245, + "step": 6983, + "time_per_iteration": 2.415895700454712 + }, + { + "auxiliary_loss_clip": 0.01138006, + "auxiliary_loss_mlp": 0.0102481, + "balance_loss_clip": 1.04539049, + "balance_loss_mlp": 1.01755309, + "epoch": 0.8397763482234113, + "flos": 22237755073920.0, + "grad_norm": 1.5791957367098992, + "language_loss": 0.86302984, + "learning_rate": 2.631655360281486e-07, + "loss": 0.88465798, + "num_input_tokens_seen": 150824920, + "step": 6984, + "time_per_iteration": 2.4833171367645264 + }, + { + "auxiliary_loss_clip": 0.01154089, + "auxiliary_loss_mlp": 0.00761754, + "balance_loss_clip": 1.04390144, + "balance_loss_mlp": 1.0005095, + "epoch": 0.8398965911140504, + "flos": 22163743100160.0, + "grad_norm": 1.7952765267680695, + "language_loss": 0.65966392, + "learning_rate": 2.6277942702609323e-07, + "loss": 0.67882234, + "num_input_tokens_seen": 150844400, + "step": 6985, + "time_per_iteration": 2.4489188194274902 + }, + { + "auxiliary_loss_clip": 0.01124673, + "auxiliary_loss_mlp": 0.01026844, + "balance_loss_clip": 1.04439592, + "balance_loss_mlp": 1.01978111, + "epoch": 0.8400168340046895, + "flos": 21542775753600.0, + "grad_norm": 1.962985614768301, + "language_loss": 0.8769843, + "learning_rate": 2.623935815590186e-07, + "loss": 0.89849943, + "num_input_tokens_seen": 150862780, + "step": 6986, + "time_per_iteration": 2.5191588401794434 + }, + { + "auxiliary_loss_clip": 0.01139752, + "auxiliary_loss_mlp": 0.01022126, + "balance_loss_clip": 1.04759479, + "balance_loss_mlp": 1.01508653, + "epoch": 0.8401370768953286, + "flos": 22491966602880.0, + "grad_norm": 1.793309077995339, + "language_loss": 0.80801427, + "learning_rate": 2.6200799968545516e-07, + "loss": 0.829633, + "num_input_tokens_seen": 150883075, + "step": 6987, + "time_per_iteration": 2.4853620529174805 + }, + { + "auxiliary_loss_clip": 0.01038587, + "auxiliary_loss_mlp": 0.0100071, + "balance_loss_clip": 1.01141524, + "balance_loss_mlp": 0.99992907, + "epoch": 0.8402573197859676, + "flos": 59238890818560.0, + "grad_norm": 0.7862854362295917, + "language_loss": 0.56427372, + "learning_rate": 2.616226814638969e-07, + "loss": 0.58466667, + "num_input_tokens_seen": 150948180, + "step": 6988, + "time_per_iteration": 3.092822551727295 + }, + { + "auxiliary_loss_clip": 0.01138336, + "auxiliary_loss_mlp": 0.0102241, + "balance_loss_clip": 1.04511523, + "balance_loss_mlp": 1.0157733, + "epoch": 0.8403775626766068, + "flos": 22674608282880.0, + "grad_norm": 1.905849159551543, + "language_loss": 0.77137804, + "learning_rate": 2.612376269527954e-07, + "loss": 0.79298556, + "num_input_tokens_seen": 150967885, + "step": 6989, + "time_per_iteration": 2.5085322856903076 + }, + { + "auxiliary_loss_clip": 0.01133763, + "auxiliary_loss_mlp": 0.01025864, + "balance_loss_clip": 1.04457855, + "balance_loss_mlp": 1.01916468, + "epoch": 0.8404978055672458, + "flos": 19609704495360.0, + "grad_norm": 1.669140990877128, + "language_loss": 0.67639565, + "learning_rate": 2.608528362105635e-07, + "loss": 0.69799197, + "num_input_tokens_seen": 150987255, + "step": 6990, + "time_per_iteration": 2.461534023284912 + }, + { + "auxiliary_loss_clip": 0.01122708, + "auxiliary_loss_mlp": 0.01022297, + "balance_loss_clip": 1.03927422, + "balance_loss_mlp": 1.01555538, + "epoch": 0.8406180484578849, + "flos": 27526929678720.0, + "grad_norm": 1.9221059056010676, + "language_loss": 0.7348206, + "learning_rate": 2.6046830929557374e-07, + "loss": 0.75627065, + "num_input_tokens_seen": 151006905, + "step": 6991, + "time_per_iteration": 2.5702147483825684 + }, + { + "auxiliary_loss_clip": 0.01119833, + "auxiliary_loss_mlp": 0.01023608, + "balance_loss_clip": 1.04361022, + "balance_loss_mlp": 1.01688504, + "epoch": 0.8407382913485241, + "flos": 22127473342080.0, + "grad_norm": 1.9428695315506979, + "language_loss": 0.84759438, + "learning_rate": 2.6008404626615776e-07, + "loss": 0.86902881, + "num_input_tokens_seen": 151025405, + "step": 6992, + "time_per_iteration": 2.5197558403015137 + }, + { + "auxiliary_loss_clip": 0.01157517, + "auxiliary_loss_mlp": 0.01023965, + "balance_loss_clip": 1.04906511, + "balance_loss_mlp": 1.01738441, + "epoch": 0.8408585342391631, + "flos": 13918473982080.0, + "grad_norm": 3.084258260778641, + "language_loss": 0.74324363, + "learning_rate": 2.597000471806092e-07, + "loss": 0.7650584, + "num_input_tokens_seen": 151041970, + "step": 6993, + "time_per_iteration": 2.442943572998047 + }, + { + "auxiliary_loss_clip": 0.01135269, + "auxiliary_loss_mlp": 0.01023159, + "balance_loss_clip": 1.04818904, + "balance_loss_mlp": 1.01504421, + "epoch": 0.8409787771298022, + "flos": 20187865808640.0, + "grad_norm": 1.9098399148648195, + "language_loss": 0.73390204, + "learning_rate": 2.593163120971793e-07, + "loss": 0.75548637, + "num_input_tokens_seen": 151060835, + "step": 6994, + "time_per_iteration": 2.439973831176758 + }, + { + "auxiliary_loss_clip": 0.01099618, + "auxiliary_loss_mlp": 0.010228, + "balance_loss_clip": 1.03676105, + "balance_loss_mlp": 1.01613069, + "epoch": 0.8410990200204413, + "flos": 23142523777920.0, + "grad_norm": 1.7711748046407754, + "language_loss": 0.68708283, + "learning_rate": 2.5893284107408165e-07, + "loss": 0.70830703, + "num_input_tokens_seen": 151078205, + "step": 6995, + "time_per_iteration": 2.5352261066436768 + }, + { + "auxiliary_loss_clip": 0.01111214, + "auxiliary_loss_mlp": 0.01027742, + "balance_loss_clip": 1.0441314, + "balance_loss_mlp": 1.02020264, + "epoch": 0.8412192629110804, + "flos": 24027219757440.0, + "grad_norm": 1.7909777825080915, + "language_loss": 0.77824235, + "learning_rate": 2.5854963416948726e-07, + "loss": 0.79963195, + "num_input_tokens_seen": 151100470, + "step": 6996, + "time_per_iteration": 2.6048130989074707 + }, + { + "auxiliary_loss_clip": 0.01107499, + "auxiliary_loss_mlp": 0.01024388, + "balance_loss_clip": 1.03611398, + "balance_loss_mlp": 1.01739049, + "epoch": 0.8413395058017195, + "flos": 25591703604480.0, + "grad_norm": 2.029241840102886, + "language_loss": 0.69332278, + "learning_rate": 2.5816669144152816e-07, + "loss": 0.71464163, + "num_input_tokens_seen": 151121650, + "step": 6997, + "time_per_iteration": 2.5674712657928467 + }, + { + "auxiliary_loss_clip": 0.01061934, + "auxiliary_loss_mlp": 0.01000901, + "balance_loss_clip": 1.00829625, + "balance_loss_mlp": 1.00005424, + "epoch": 0.8414597486923585, + "flos": 63635396624640.0, + "grad_norm": 0.8665706395173894, + "language_loss": 0.66371548, + "learning_rate": 2.5778401294829777e-07, + "loss": 0.68434381, + "num_input_tokens_seen": 151180390, + "step": 6998, + "time_per_iteration": 3.0523874759674072 + }, + { + "auxiliary_loss_clip": 0.0114817, + "auxiliary_loss_mlp": 0.00761816, + "balance_loss_clip": 1.04551339, + "balance_loss_mlp": 1.00046432, + "epoch": 0.8415799915829977, + "flos": 19098731571840.0, + "grad_norm": 1.6629609593982126, + "language_loss": 0.65053612, + "learning_rate": 2.574015987478473e-07, + "loss": 0.66963595, + "num_input_tokens_seen": 151198520, + "step": 6999, + "time_per_iteration": 2.4193525314331055 + }, + { + "auxiliary_loss_clip": 0.01142237, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.04505825, + "balance_loss_mlp": 1.01735151, + "epoch": 0.8417002344736367, + "flos": 19821612781440.0, + "grad_norm": 2.4083744176707147, + "language_loss": 0.86991733, + "learning_rate": 2.570194488981887e-07, + "loss": 0.89158493, + "num_input_tokens_seen": 151215065, + "step": 7000, + "time_per_iteration": 2.569504499435425 + }, + { + "auxiliary_loss_clip": 0.01062398, + "auxiliary_loss_mlp": 0.0100188, + "balance_loss_clip": 1.00882101, + "balance_loss_mlp": 1.00096226, + "epoch": 0.8418204773642758, + "flos": 62161516834560.0, + "grad_norm": 0.8395303638495538, + "language_loss": 0.60327309, + "learning_rate": 2.566375634572939e-07, + "loss": 0.62391591, + "num_input_tokens_seen": 151275705, + "step": 7001, + "time_per_iteration": 3.7731032371520996 + }, + { + "auxiliary_loss_clip": 0.01127754, + "auxiliary_loss_mlp": 0.01026222, + "balance_loss_clip": 1.04184294, + "balance_loss_mlp": 1.0197494, + "epoch": 0.841940720254915, + "flos": 17092905315840.0, + "grad_norm": 2.1404324055956803, + "language_loss": 0.76612818, + "learning_rate": 2.562559424830943e-07, + "loss": 0.78766793, + "num_input_tokens_seen": 151293665, + "step": 7002, + "time_per_iteration": 3.2008044719696045 + }, + { + "auxiliary_loss_clip": 0.01135183, + "auxiliary_loss_mlp": 0.01021252, + "balance_loss_clip": 1.04324269, + "balance_loss_mlp": 1.01389098, + "epoch": 0.842060963145554, + "flos": 16283586026880.0, + "grad_norm": 2.29952970873097, + "language_loss": 0.70264083, + "learning_rate": 2.5587458603348256e-07, + "loss": 0.7242052, + "num_input_tokens_seen": 151310955, + "step": 7003, + "time_per_iteration": 3.311286211013794 + }, + { + "auxiliary_loss_clip": 0.01117077, + "auxiliary_loss_mlp": 0.0102657, + "balance_loss_clip": 1.04225576, + "balance_loss_mlp": 1.0195303, + "epoch": 0.8421812060361931, + "flos": 21908238681600.0, + "grad_norm": 1.9825496680862764, + "language_loss": 0.84104604, + "learning_rate": 2.554934941663085e-07, + "loss": 0.86248249, + "num_input_tokens_seen": 151328490, + "step": 7004, + "time_per_iteration": 2.5046706199645996 + }, + { + "auxiliary_loss_clip": 0.01121421, + "auxiliary_loss_mlp": 0.01025504, + "balance_loss_clip": 1.04192698, + "balance_loss_mlp": 1.01776981, + "epoch": 0.8423014489268322, + "flos": 27777693502080.0, + "grad_norm": 2.7630898728600113, + "language_loss": 0.73586118, + "learning_rate": 2.5511266693938484e-07, + "loss": 0.75733042, + "num_input_tokens_seen": 151346950, + "step": 7005, + "time_per_iteration": 2.54984974861145 + }, + { + "auxiliary_loss_clip": 0.01135532, + "auxiliary_loss_mlp": 0.01022897, + "balance_loss_clip": 1.04614401, + "balance_loss_mlp": 1.01546133, + "epoch": 0.8424216918174713, + "flos": 25117610970240.0, + "grad_norm": 1.6079876458492246, + "language_loss": 0.77764171, + "learning_rate": 2.547321044104822e-07, + "loss": 0.79922599, + "num_input_tokens_seen": 151368445, + "step": 7006, + "time_per_iteration": 2.513831615447998 + }, + { + "auxiliary_loss_clip": 0.01167217, + "auxiliary_loss_mlp": 0.01023001, + "balance_loss_clip": 1.04758072, + "balance_loss_mlp": 1.01549375, + "epoch": 0.8425419347081103, + "flos": 24748448941440.0, + "grad_norm": 1.634285320447925, + "language_loss": 0.76497817, + "learning_rate": 2.5435180663733113e-07, + "loss": 0.78688037, + "num_input_tokens_seen": 151388745, + "step": 7007, + "time_per_iteration": 2.4596002101898193 + }, + { + "auxiliary_loss_clip": 0.01117529, + "auxiliary_loss_mlp": 0.01026392, + "balance_loss_clip": 1.04068685, + "balance_loss_mlp": 1.01949883, + "epoch": 0.8426621775987495, + "flos": 24820916630400.0, + "grad_norm": 2.883054714080116, + "language_loss": 0.71693021, + "learning_rate": 2.539717736776241e-07, + "loss": 0.73836941, + "num_input_tokens_seen": 151404970, + "step": 7008, + "time_per_iteration": 2.551286220550537 + }, + { + "auxiliary_loss_clip": 0.01148071, + "auxiliary_loss_mlp": 0.01020657, + "balance_loss_clip": 1.04632854, + "balance_loss_mlp": 1.01380873, + "epoch": 0.8427824204893886, + "flos": 23550074467200.0, + "grad_norm": 1.3407034826020992, + "language_loss": 0.76278132, + "learning_rate": 2.535920055890097e-07, + "loss": 0.78446853, + "num_input_tokens_seen": 151426265, + "step": 7009, + "time_per_iteration": 3.2268190383911133 + }, + { + "auxiliary_loss_clip": 0.0110609, + "auxiliary_loss_mlp": 0.01024158, + "balance_loss_clip": 1.04093981, + "balance_loss_mlp": 1.0167129, + "epoch": 0.8429026633800276, + "flos": 16143858120960.0, + "grad_norm": 2.1513441011988768, + "language_loss": 0.64647067, + "learning_rate": 2.5321250242910006e-07, + "loss": 0.66777313, + "num_input_tokens_seen": 151444180, + "step": 7010, + "time_per_iteration": 2.519245147705078 + }, + { + "auxiliary_loss_clip": 0.01164749, + "auxiliary_loss_mlp": 0.01021121, + "balance_loss_clip": 1.04798377, + "balance_loss_mlp": 1.01430249, + "epoch": 0.8430229062706668, + "flos": 22198540400640.0, + "grad_norm": 1.7210399338278748, + "language_loss": 0.8663035, + "learning_rate": 2.5283326425546493e-07, + "loss": 0.8881622, + "num_input_tokens_seen": 151463290, + "step": 7011, + "time_per_iteration": 2.4217967987060547 + }, + { + "auxiliary_loss_clip": 0.01118338, + "auxiliary_loss_mlp": 0.01021769, + "balance_loss_clip": 1.04444456, + "balance_loss_mlp": 1.01512289, + "epoch": 0.8431431491613058, + "flos": 35330317683840.0, + "grad_norm": 2.4405993873849874, + "language_loss": 0.6960088, + "learning_rate": 2.5245429112563443e-07, + "loss": 0.71740985, + "num_input_tokens_seen": 151483965, + "step": 7012, + "time_per_iteration": 2.660486936569214 + }, + { + "auxiliary_loss_clip": 0.01151365, + "auxiliary_loss_mlp": 0.01025005, + "balance_loss_clip": 1.04739499, + "balance_loss_mlp": 1.01815689, + "epoch": 0.8432633920519449, + "flos": 25812374808960.0, + "grad_norm": 2.11706414987177, + "language_loss": 0.81914449, + "learning_rate": 2.5207558309709865e-07, + "loss": 0.84090823, + "num_input_tokens_seen": 151503700, + "step": 7013, + "time_per_iteration": 2.4803364276885986 + }, + { + "auxiliary_loss_clip": 0.01036122, + "auxiliary_loss_mlp": 0.00752945, + "balance_loss_clip": 1.00835383, + "balance_loss_mlp": 0.99995768, + "epoch": 0.8433836349425841, + "flos": 64959531592320.0, + "grad_norm": 0.6855319897730692, + "language_loss": 0.56307161, + "learning_rate": 2.516971402273065e-07, + "loss": 0.58096224, + "num_input_tokens_seen": 151569765, + "step": 7014, + "time_per_iteration": 3.092280864715576 + }, + { + "auxiliary_loss_clip": 0.01138566, + "auxiliary_loss_mlp": 0.0102131, + "balance_loss_clip": 1.04304528, + "balance_loss_mlp": 1.01442587, + "epoch": 0.8435038778332231, + "flos": 20229989483520.0, + "grad_norm": 2.0057004313342417, + "language_loss": 0.67897224, + "learning_rate": 2.513189625736687e-07, + "loss": 0.70057106, + "num_input_tokens_seen": 151586660, + "step": 7015, + "time_per_iteration": 2.4535529613494873 + }, + { + "auxiliary_loss_clip": 0.01127464, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.04397988, + "balance_loss_mlp": 1.02226651, + "epoch": 0.8436241207238622, + "flos": 20992229020800.0, + "grad_norm": 2.3479606485779536, + "language_loss": 0.71756548, + "learning_rate": 2.509410501935534e-07, + "loss": 0.73913723, + "num_input_tokens_seen": 151602295, + "step": 7016, + "time_per_iteration": 2.494269847869873 + }, + { + "auxiliary_loss_clip": 0.01137849, + "auxiliary_loss_mlp": 0.01025129, + "balance_loss_clip": 1.04366112, + "balance_loss_mlp": 1.01775014, + "epoch": 0.8437443636145013, + "flos": 14682257804160.0, + "grad_norm": 2.8334619111776687, + "language_loss": 0.75318938, + "learning_rate": 2.5056340314429116e-07, + "loss": 0.77481914, + "num_input_tokens_seen": 151619760, + "step": 7017, + "time_per_iteration": 2.4375545978546143 + }, + { + "auxiliary_loss_clip": 0.01112279, + "auxiliary_loss_mlp": 0.01026009, + "balance_loss_clip": 1.04012799, + "balance_loss_mlp": 1.01819444, + "epoch": 0.8438646065051404, + "flos": 21608814908160.0, + "grad_norm": 2.265978582369029, + "language_loss": 0.80613446, + "learning_rate": 2.5018602148316904e-07, + "loss": 0.82751739, + "num_input_tokens_seen": 151635795, + "step": 7018, + "time_per_iteration": 2.542299509048462 + }, + { + "auxiliary_loss_clip": 0.01164011, + "auxiliary_loss_mlp": 0.01024838, + "balance_loss_clip": 1.04841876, + "balance_loss_mlp": 1.01825452, + "epoch": 0.8439848493957794, + "flos": 23289937194240.0, + "grad_norm": 2.1239860431179127, + "language_loss": 0.79964215, + "learning_rate": 2.498089052674359e-07, + "loss": 0.82153064, + "num_input_tokens_seen": 151653770, + "step": 7019, + "time_per_iteration": 2.4089088439941406 + }, + { + "auxiliary_loss_clip": 0.01151191, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.04663754, + "balance_loss_mlp": 1.02433455, + "epoch": 0.8441050922864186, + "flos": 19719339782400.0, + "grad_norm": 4.8655346410299085, + "language_loss": 0.75239706, + "learning_rate": 2.494320545543007e-07, + "loss": 0.77422404, + "num_input_tokens_seen": 151673340, + "step": 7020, + "time_per_iteration": 2.429816246032715 + }, + { + "auxiliary_loss_clip": 0.01168795, + "auxiliary_loss_mlp": 0.01025901, + "balance_loss_clip": 1.04787779, + "balance_loss_mlp": 1.01792574, + "epoch": 0.8442253351770577, + "flos": 21835268202240.0, + "grad_norm": 2.9438224088060516, + "language_loss": 0.66626304, + "learning_rate": 2.490554694009308e-07, + "loss": 0.68821001, + "num_input_tokens_seen": 151694205, + "step": 7021, + "time_per_iteration": 2.442040205001831 + }, + { + "auxiliary_loss_clip": 0.01154134, + "auxiliary_loss_mlp": 0.01028043, + "balance_loss_clip": 1.04513526, + "balance_loss_mlp": 1.02133143, + "epoch": 0.8443455780676967, + "flos": 34346365447680.0, + "grad_norm": 1.690566373744392, + "language_loss": 0.78432953, + "learning_rate": 2.4867914986445426e-07, + "loss": 0.80615127, + "num_input_tokens_seen": 151716595, + "step": 7022, + "time_per_iteration": 2.5730972290039062 + }, + { + "auxiliary_loss_clip": 0.01139241, + "auxiliary_loss_mlp": 0.01024833, + "balance_loss_clip": 1.04228497, + "balance_loss_mlp": 1.01833272, + "epoch": 0.8444658209583359, + "flos": 48214599281280.0, + "grad_norm": 1.8828106901096475, + "language_loss": 0.71129698, + "learning_rate": 2.483030960019581e-07, + "loss": 0.73293775, + "num_input_tokens_seen": 151740525, + "step": 7023, + "time_per_iteration": 2.7226295471191406 + }, + { + "auxiliary_loss_clip": 0.01018381, + "auxiliary_loss_mlp": 0.01003795, + "balance_loss_clip": 1.0081439, + "balance_loss_mlp": 1.00277555, + "epoch": 0.8445860638489749, + "flos": 68484773105280.0, + "grad_norm": 0.7366740590603003, + "language_loss": 0.55469286, + "learning_rate": 2.479273078704891e-07, + "loss": 0.57491457, + "num_input_tokens_seen": 151793890, + "step": 7024, + "time_per_iteration": 2.962144136428833 + }, + { + "auxiliary_loss_clip": 0.01014278, + "auxiliary_loss_mlp": 0.01003095, + "balance_loss_clip": 1.01141953, + "balance_loss_mlp": 1.00209367, + "epoch": 0.844706306739614, + "flos": 62833331882880.0, + "grad_norm": 0.7809757381921016, + "language_loss": 0.6473707, + "learning_rate": 2.475517855270552e-07, + "loss": 0.66754442, + "num_input_tokens_seen": 151853970, + "step": 7025, + "time_per_iteration": 3.0823192596435547 + }, + { + "auxiliary_loss_clip": 0.0116425, + "auxiliary_loss_mlp": 0.01024306, + "balance_loss_clip": 1.04844451, + "balance_loss_mlp": 1.01764858, + "epoch": 0.8448265496302532, + "flos": 14976114969600.0, + "grad_norm": 1.8202925918147292, + "language_loss": 0.72594321, + "learning_rate": 2.4717652902862143e-07, + "loss": 0.74782872, + "num_input_tokens_seen": 151872945, + "step": 7026, + "time_per_iteration": 2.411222457885742 + }, + { + "auxiliary_loss_clip": 0.01140705, + "auxiliary_loss_mlp": 0.01023645, + "balance_loss_clip": 1.04426026, + "balance_loss_mlp": 1.01702857, + "epoch": 0.8449467925208922, + "flos": 23441265192960.0, + "grad_norm": 1.617793512075884, + "language_loss": 0.81420934, + "learning_rate": 2.4680153843211495e-07, + "loss": 0.8358528, + "num_input_tokens_seen": 151892875, + "step": 7027, + "time_per_iteration": 2.5046610832214355 + }, + { + "auxiliary_loss_clip": 0.01136664, + "auxiliary_loss_mlp": 0.01022622, + "balance_loss_clip": 1.04533029, + "balance_loss_mlp": 1.01553178, + "epoch": 0.8450670354115313, + "flos": 22748045639040.0, + "grad_norm": 1.659828788536089, + "language_loss": 0.72587234, + "learning_rate": 2.464268137944212e-07, + "loss": 0.74746519, + "num_input_tokens_seen": 151914170, + "step": 7028, + "time_per_iteration": 3.3836312294006348 + }, + { + "auxiliary_loss_clip": 0.01099818, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.0400908, + "balance_loss_mlp": 1.01974404, + "epoch": 0.8451872783021703, + "flos": 29825571605760.0, + "grad_norm": 2.9001062545065763, + "language_loss": 0.78255951, + "learning_rate": 2.46052355172385e-07, + "loss": 0.80383521, + "num_input_tokens_seen": 151932210, + "step": 7029, + "time_per_iteration": 3.48899507522583 + }, + { + "auxiliary_loss_clip": 0.01164604, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.04637384, + "balance_loss_mlp": 1.02118921, + "epoch": 0.8453075211928095, + "flos": 21870029589120.0, + "grad_norm": 1.7236844803737197, + "language_loss": 0.74734879, + "learning_rate": 2.456781626228128e-07, + "loss": 0.76928246, + "num_input_tokens_seen": 151951715, + "step": 7030, + "time_per_iteration": 3.3004095554351807 + }, + { + "auxiliary_loss_clip": 0.01022387, + "auxiliary_loss_mlp": 0.00753187, + "balance_loss_clip": 1.00737798, + "balance_loss_mlp": 0.9999218, + "epoch": 0.8454277640834486, + "flos": 58751869288320.0, + "grad_norm": 0.9131877959224518, + "language_loss": 0.66333985, + "learning_rate": 2.453042362024675e-07, + "loss": 0.6810956, + "num_input_tokens_seen": 152004960, + "step": 7031, + "time_per_iteration": 3.135413885116577 + }, + { + "auxiliary_loss_clip": 0.01163056, + "auxiliary_loss_mlp": 0.0102616, + "balance_loss_clip": 1.04623342, + "balance_loss_mlp": 1.01947212, + "epoch": 0.8455480069740876, + "flos": 27090076469760.0, + "grad_norm": 1.6454883883613196, + "language_loss": 0.73299348, + "learning_rate": 2.449305759680751e-07, + "loss": 0.75488567, + "num_input_tokens_seen": 152026285, + "step": 7032, + "time_per_iteration": 2.4569928646087646 + }, + { + "auxiliary_loss_clip": 0.01121787, + "auxiliary_loss_mlp": 0.01026833, + "balance_loss_clip": 1.04415393, + "balance_loss_mlp": 1.01988935, + "epoch": 0.8456682498647268, + "flos": 27198670262400.0, + "grad_norm": 1.5675712988851356, + "language_loss": 0.75331032, + "learning_rate": 2.445571819763188e-07, + "loss": 0.77479649, + "num_input_tokens_seen": 152048585, + "step": 7033, + "time_per_iteration": 2.6643624305725098 + }, + { + "auxiliary_loss_clip": 0.01164267, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.0476284, + "balance_loss_mlp": 1.02287579, + "epoch": 0.8457884927553658, + "flos": 20631901737600.0, + "grad_norm": 1.77588912014989, + "language_loss": 0.58323652, + "learning_rate": 2.4418405428384227e-07, + "loss": 0.60518217, + "num_input_tokens_seen": 152068795, + "step": 7034, + "time_per_iteration": 2.436022996902466 + }, + { + "auxiliary_loss_clip": 0.0116601, + "auxiliary_loss_mlp": 0.00761763, + "balance_loss_clip": 1.04790545, + "balance_loss_mlp": 1.00046527, + "epoch": 0.8459087356460049, + "flos": 15299023259520.0, + "grad_norm": 1.7307936949013905, + "language_loss": 0.71941221, + "learning_rate": 2.4381119294724864e-07, + "loss": 0.7386899, + "num_input_tokens_seen": 152086240, + "step": 7035, + "time_per_iteration": 2.3920671939849854 + }, + { + "auxiliary_loss_clip": 0.01165406, + "auxiliary_loss_mlp": 0.01021897, + "balance_loss_clip": 1.04743385, + "balance_loss_mlp": 1.01508093, + "epoch": 0.846028978536644, + "flos": 18843155326080.0, + "grad_norm": 3.6283295659671184, + "language_loss": 0.53970593, + "learning_rate": 2.434385980231004e-07, + "loss": 0.56157893, + "num_input_tokens_seen": 152105080, + "step": 7036, + "time_per_iteration": 3.1459147930145264 + }, + { + "auxiliary_loss_clip": 0.01150569, + "auxiliary_loss_mlp": 0.01027904, + "balance_loss_clip": 1.04591298, + "balance_loss_mlp": 1.02069736, + "epoch": 0.8461492214272831, + "flos": 52661740285440.0, + "grad_norm": 1.492225457134181, + "language_loss": 0.6556018, + "learning_rate": 2.4306626956792043e-07, + "loss": 0.67738652, + "num_input_tokens_seen": 152130025, + "step": 7037, + "time_per_iteration": 2.719534158706665 + }, + { + "auxiliary_loss_clip": 0.01148756, + "auxiliary_loss_mlp": 0.01024081, + "balance_loss_clip": 1.04312038, + "balance_loss_mlp": 1.0173254, + "epoch": 0.8462694643179222, + "flos": 18588405093120.0, + "grad_norm": 1.677755895706697, + "language_loss": 0.75646615, + "learning_rate": 2.4269420763819017e-07, + "loss": 0.77819455, + "num_input_tokens_seen": 152148070, + "step": 7038, + "time_per_iteration": 2.429368257522583 + }, + { + "auxiliary_loss_clip": 0.01147268, + "auxiliary_loss_mlp": 0.01022207, + "balance_loss_clip": 1.04502559, + "balance_loss_mlp": 1.01565695, + "epoch": 0.8463897072085613, + "flos": 24387080163840.0, + "grad_norm": 2.3201240319917944, + "language_loss": 0.83149451, + "learning_rate": 2.4232241229035223e-07, + "loss": 0.85318923, + "num_input_tokens_seen": 152165825, + "step": 7039, + "time_per_iteration": 2.4675028324127197 + }, + { + "auxiliary_loss_clip": 0.01052888, + "auxiliary_loss_mlp": 0.01000745, + "balance_loss_clip": 1.0079565, + "balance_loss_mlp": 0.99983925, + "epoch": 0.8465099500992004, + "flos": 68702140258560.0, + "grad_norm": 0.8032244963941125, + "language_loss": 0.5678364, + "learning_rate": 2.419508835808064e-07, + "loss": 0.58837271, + "num_input_tokens_seen": 152222380, + "step": 7040, + "time_per_iteration": 2.9750545024871826 + }, + { + "auxiliary_loss_clip": 0.0113739, + "auxiliary_loss_mlp": 0.01021965, + "balance_loss_clip": 1.04579735, + "balance_loss_mlp": 1.01506317, + "epoch": 0.8466301929898394, + "flos": 13735724561280.0, + "grad_norm": 1.9433629375392678, + "language_loss": 0.6314559, + "learning_rate": 2.415796215659134e-07, + "loss": 0.65304947, + "num_input_tokens_seen": 152239085, + "step": 7041, + "time_per_iteration": 2.4568138122558594 + }, + { + "auxiliary_loss_clip": 0.011259, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.04076469, + "balance_loss_mlp": 1.02414083, + "epoch": 0.8467504358804786, + "flos": 19241260738560.0, + "grad_norm": 2.011923393267298, + "language_loss": 0.77123839, + "learning_rate": 2.412086263019939e-07, + "loss": 0.79280818, + "num_input_tokens_seen": 152257110, + "step": 7042, + "time_per_iteration": 2.4937891960144043 + }, + { + "auxiliary_loss_clip": 0.01160337, + "auxiliary_loss_mlp": 0.01022558, + "balance_loss_clip": 1.04707003, + "balance_loss_mlp": 1.01592696, + "epoch": 0.8468706787711177, + "flos": 21324115710720.0, + "grad_norm": 2.320967958914889, + "language_loss": 0.79747272, + "learning_rate": 2.408378978453276e-07, + "loss": 0.81930166, + "num_input_tokens_seen": 152277230, + "step": 7043, + "time_per_iteration": 2.4199469089508057 + }, + { + "auxiliary_loss_clip": 0.01053078, + "auxiliary_loss_mlp": 0.01002209, + "balance_loss_clip": 1.00811934, + "balance_loss_mlp": 1.00129664, + "epoch": 0.8469909216617567, + "flos": 64877439058560.0, + "grad_norm": 0.8116887091471326, + "language_loss": 0.63976562, + "learning_rate": 2.404674362521533e-07, + "loss": 0.66031849, + "num_input_tokens_seen": 152335725, + "step": 7044, + "time_per_iteration": 2.9290642738342285 + }, + { + "auxiliary_loss_clip": 0.01151659, + "auxiliary_loss_mlp": 0.0102219, + "balance_loss_clip": 1.04819751, + "balance_loss_mlp": 1.01595497, + "epoch": 0.8471111645523959, + "flos": 19280583152640.0, + "grad_norm": 2.2022597730265208, + "language_loss": 0.74563384, + "learning_rate": 2.4009724157866997e-07, + "loss": 0.76737237, + "num_input_tokens_seen": 152352785, + "step": 7045, + "time_per_iteration": 2.4344186782836914 + }, + { + "auxiliary_loss_clip": 0.0116297, + "auxiliary_loss_mlp": 0.01022478, + "balance_loss_clip": 1.04652834, + "balance_loss_mlp": 1.01587665, + "epoch": 0.8472314074430349, + "flos": 22015826893440.0, + "grad_norm": 2.0010858820738973, + "language_loss": 0.76488608, + "learning_rate": 2.3972731388103564e-07, + "loss": 0.78674054, + "num_input_tokens_seen": 152371265, + "step": 7046, + "time_per_iteration": 2.411935806274414 + }, + { + "auxiliary_loss_clip": 0.01005485, + "auxiliary_loss_mlp": 0.01002247, + "balance_loss_clip": 1.00797129, + "balance_loss_mlp": 1.00129282, + "epoch": 0.847351650333674, + "flos": 57882580243200.0, + "grad_norm": 0.8081796483934046, + "language_loss": 0.62394696, + "learning_rate": 2.393576532153687e-07, + "loss": 0.64402425, + "num_input_tokens_seen": 152435050, + "step": 7047, + "time_per_iteration": 3.3289384841918945 + }, + { + "auxiliary_loss_clip": 0.01049463, + "auxiliary_loss_mlp": 0.01000561, + "balance_loss_clip": 1.00717735, + "balance_loss_mlp": 0.99964905, + "epoch": 0.8474718932243132, + "flos": 41284238313600.0, + "grad_norm": 1.0148745035124487, + "language_loss": 0.57805216, + "learning_rate": 2.389882596377453e-07, + "loss": 0.59855247, + "num_input_tokens_seen": 152489315, + "step": 7048, + "time_per_iteration": 3.1335747241973877 + }, + { + "auxiliary_loss_clip": 0.01162949, + "auxiliary_loss_mlp": 0.01023258, + "balance_loss_clip": 1.04563689, + "balance_loss_mlp": 1.01646328, + "epoch": 0.8475921361149522, + "flos": 38180906974080.0, + "grad_norm": 1.783882695297717, + "language_loss": 0.76419449, + "learning_rate": 2.386191332042031e-07, + "loss": 0.78605652, + "num_input_tokens_seen": 152511210, + "step": 7049, + "time_per_iteration": 2.559741973876953 + }, + { + "auxiliary_loss_clip": 0.01170272, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.04890227, + "balance_loss_mlp": 1.02202296, + "epoch": 0.8477123790055913, + "flos": 25375054723200.0, + "grad_norm": 1.6930635494095527, + "language_loss": 0.72977793, + "learning_rate": 2.3825027397073794e-07, + "loss": 0.75177181, + "num_input_tokens_seen": 152531685, + "step": 7050, + "time_per_iteration": 2.455000638961792 + }, + { + "auxiliary_loss_clip": 0.01149127, + "auxiliary_loss_mlp": 0.01024604, + "balance_loss_clip": 1.04783297, + "balance_loss_mlp": 1.01772869, + "epoch": 0.8478326218962304, + "flos": 30225185389440.0, + "grad_norm": 1.9775603583509382, + "language_loss": 0.66726625, + "learning_rate": 2.3788168199330515e-07, + "loss": 0.68900353, + "num_input_tokens_seen": 152553245, + "step": 7051, + "time_per_iteration": 2.502920389175415 + }, + { + "auxiliary_loss_clip": 0.01121482, + "auxiliary_loss_mlp": 0.01023664, + "balance_loss_clip": 1.03889132, + "balance_loss_mlp": 1.01621366, + "epoch": 0.8479528647868695, + "flos": 38213800853760.0, + "grad_norm": 1.5450701602903603, + "language_loss": 0.72626805, + "learning_rate": 2.3751335732782074e-07, + "loss": 0.74771953, + "num_input_tokens_seen": 152574505, + "step": 7052, + "time_per_iteration": 2.6080124378204346 + }, + { + "auxiliary_loss_clip": 0.01151635, + "auxiliary_loss_mlp": 0.01023658, + "balance_loss_clip": 1.04819822, + "balance_loss_mlp": 1.01690161, + "epoch": 0.8480731076775085, + "flos": 20957790856320.0, + "grad_norm": 1.9118088469892158, + "language_loss": 0.79593438, + "learning_rate": 2.371453000301582e-07, + "loss": 0.81768733, + "num_input_tokens_seen": 152593190, + "step": 7053, + "time_per_iteration": 2.4335508346557617 + }, + { + "auxiliary_loss_clip": 0.01119364, + "auxiliary_loss_mlp": 0.01020221, + "balance_loss_clip": 1.04296207, + "balance_loss_mlp": 1.01344085, + "epoch": 0.8481933505681477, + "flos": 32596510487040.0, + "grad_norm": 1.6431137248093355, + "language_loss": 0.7410087, + "learning_rate": 2.3677751015615222e-07, + "loss": 0.76240456, + "num_input_tokens_seen": 152615265, + "step": 7054, + "time_per_iteration": 2.5895867347717285 + }, + { + "auxiliary_loss_clip": 0.01125532, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.04154372, + "balance_loss_mlp": 1.02259183, + "epoch": 0.8483135934587868, + "flos": 20741177888640.0, + "grad_norm": 4.149614036353006, + "language_loss": 0.85451144, + "learning_rate": 2.3640998776159593e-07, + "loss": 0.87606359, + "num_input_tokens_seen": 152632770, + "step": 7055, + "time_per_iteration": 4.348072290420532 + }, + { + "auxiliary_loss_clip": 0.01137351, + "auxiliary_loss_mlp": 0.01022682, + "balance_loss_clip": 1.04526162, + "balance_loss_mlp": 1.01656961, + "epoch": 0.8484338363494258, + "flos": 21653057485440.0, + "grad_norm": 1.6847481393658448, + "language_loss": 0.81471545, + "learning_rate": 2.3604273290224253e-07, + "loss": 0.83631575, + "num_input_tokens_seen": 152653485, + "step": 7056, + "time_per_iteration": 2.500746965408325 + }, + { + "auxiliary_loss_clip": 0.01140739, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.04679561, + "balance_loss_mlp": 1.02154207, + "epoch": 0.848554079240065, + "flos": 15013964926080.0, + "grad_norm": 3.531464086747567, + "language_loss": 0.74845701, + "learning_rate": 2.356757456338039e-07, + "loss": 0.77015722, + "num_input_tokens_seen": 152670970, + "step": 7057, + "time_per_iteration": 3.2922630310058594 + }, + { + "auxiliary_loss_clip": 0.01039997, + "auxiliary_loss_mlp": 0.01001309, + "balance_loss_clip": 1.01098704, + "balance_loss_mlp": 1.00036764, + "epoch": 0.848674322130704, + "flos": 68060453742720.0, + "grad_norm": 0.7689343665705757, + "language_loss": 0.59039354, + "learning_rate": 2.3530902601195147e-07, + "loss": 0.61080658, + "num_input_tokens_seen": 152739460, + "step": 7058, + "time_per_iteration": 3.1545004844665527 + }, + { + "auxiliary_loss_clip": 0.01147982, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.04442942, + "balance_loss_mlp": 1.0201664, + "epoch": 0.8487945650213431, + "flos": 18475788977280.0, + "grad_norm": 6.813239192788781, + "language_loss": 0.79047465, + "learning_rate": 2.34942574092317e-07, + "loss": 0.8122263, + "num_input_tokens_seen": 152754710, + "step": 7059, + "time_per_iteration": 2.401810646057129 + }, + { + "auxiliary_loss_clip": 0.01154059, + "auxiliary_loss_mlp": 0.01025475, + "balance_loss_clip": 1.04601407, + "balance_loss_mlp": 1.01834035, + "epoch": 0.8489148079119821, + "flos": 23473189405440.0, + "grad_norm": 2.474307463828105, + "language_loss": 0.76760042, + "learning_rate": 2.3457638993049045e-07, + "loss": 0.78939581, + "num_input_tokens_seen": 152772700, + "step": 7060, + "time_per_iteration": 2.512019157409668 + }, + { + "auxiliary_loss_clip": 0.01099105, + "auxiliary_loss_mlp": 0.01023676, + "balance_loss_clip": 1.04408264, + "balance_loss_mlp": 1.01621664, + "epoch": 0.8490350508026213, + "flos": 19937604775680.0, + "grad_norm": 1.9053257710084055, + "language_loss": 0.64234185, + "learning_rate": 2.3421047358202252e-07, + "loss": 0.66356963, + "num_input_tokens_seen": 152791550, + "step": 7061, + "time_per_iteration": 2.5750207901000977 + }, + { + "auxiliary_loss_clip": 0.01152972, + "auxiliary_loss_mlp": 0.01026238, + "balance_loss_clip": 1.04677105, + "balance_loss_mlp": 1.01937747, + "epoch": 0.8491552936932604, + "flos": 24279958828800.0, + "grad_norm": 2.2386390924518813, + "language_loss": 0.83409441, + "learning_rate": 2.3384482510242144e-07, + "loss": 0.85588646, + "num_input_tokens_seen": 152809410, + "step": 7062, + "time_per_iteration": 2.45922589302063 + }, + { + "auxiliary_loss_clip": 0.01166247, + "auxiliary_loss_mlp": 0.01025929, + "balance_loss_clip": 1.04649138, + "balance_loss_mlp": 1.01872015, + "epoch": 0.8492755365838994, + "flos": 22522526098560.0, + "grad_norm": 2.5269522640472566, + "language_loss": 0.77260256, + "learning_rate": 2.3347944454715575e-07, + "loss": 0.79452431, + "num_input_tokens_seen": 152825800, + "step": 7063, + "time_per_iteration": 3.300910472869873 + }, + { + "auxiliary_loss_clip": 0.01168376, + "auxiliary_loss_mlp": 0.01023725, + "balance_loss_clip": 1.04738522, + "balance_loss_mlp": 1.01630998, + "epoch": 0.8493957794745386, + "flos": 26980441182720.0, + "grad_norm": 1.7028184501106434, + "language_loss": 0.67474258, + "learning_rate": 2.331143319716542e-07, + "loss": 0.69666356, + "num_input_tokens_seen": 152845330, + "step": 7064, + "time_per_iteration": 2.4657676219940186 + }, + { + "auxiliary_loss_clip": 0.01126241, + "auxiliary_loss_mlp": 0.01024094, + "balance_loss_clip": 1.04288673, + "balance_loss_mlp": 1.0171051, + "epoch": 0.8495160223651776, + "flos": 29861985018240.0, + "grad_norm": 2.0395667671375084, + "language_loss": 0.66012293, + "learning_rate": 2.3274948743130363e-07, + "loss": 0.68162626, + "num_input_tokens_seen": 152865165, + "step": 7065, + "time_per_iteration": 2.5739710330963135 + }, + { + "auxiliary_loss_clip": 0.01164269, + "auxiliary_loss_mlp": 0.01022455, + "balance_loss_clip": 1.04554188, + "balance_loss_mlp": 1.01505852, + "epoch": 0.8496362652558167, + "flos": 23075443128960.0, + "grad_norm": 1.573993371533445, + "language_loss": 0.79392433, + "learning_rate": 2.3238491098145085e-07, + "loss": 0.81579161, + "num_input_tokens_seen": 152884695, + "step": 7066, + "time_per_iteration": 2.4210197925567627 + }, + { + "auxiliary_loss_clip": 0.011484, + "auxiliary_loss_mlp": 0.01021955, + "balance_loss_clip": 1.04456878, + "balance_loss_mlp": 1.01509786, + "epoch": 0.8497565081464559, + "flos": 14609107756800.0, + "grad_norm": 2.121251356048633, + "language_loss": 0.73573291, + "learning_rate": 2.3202060267740141e-07, + "loss": 0.75743645, + "num_input_tokens_seen": 152902220, + "step": 7067, + "time_per_iteration": 2.4155685901641846 + }, + { + "auxiliary_loss_clip": 0.0110278, + "auxiliary_loss_mlp": 0.01019291, + "balance_loss_clip": 1.03824425, + "balance_loss_mlp": 1.01261258, + "epoch": 0.8498767510370949, + "flos": 21136446126720.0, + "grad_norm": 2.1323481787273066, + "language_loss": 0.77249926, + "learning_rate": 2.3165656257442044e-07, + "loss": 0.79372001, + "num_input_tokens_seen": 152920740, + "step": 7068, + "time_per_iteration": 2.5306179523468018 + }, + { + "auxiliary_loss_clip": 0.01147328, + "auxiliary_loss_mlp": 0.01020414, + "balance_loss_clip": 1.04548645, + "balance_loss_mlp": 1.01403379, + "epoch": 0.849996993927734, + "flos": 23654538195840.0, + "grad_norm": 2.1418123922771297, + "language_loss": 0.89970219, + "learning_rate": 2.31292790727734e-07, + "loss": 0.92137963, + "num_input_tokens_seen": 152938305, + "step": 7069, + "time_per_iteration": 2.462441921234131 + }, + { + "auxiliary_loss_clip": 0.01161451, + "auxiliary_loss_mlp": 0.01022922, + "balance_loss_clip": 1.04514194, + "balance_loss_mlp": 1.01657152, + "epoch": 0.8501172368183731, + "flos": 20558069331840.0, + "grad_norm": 2.718824295277621, + "language_loss": 0.80245382, + "learning_rate": 2.3092928719252392e-07, + "loss": 0.82429755, + "num_input_tokens_seen": 152956705, + "step": 7070, + "time_per_iteration": 2.419555187225342 + }, + { + "auxiliary_loss_clip": 0.01148455, + "auxiliary_loss_mlp": 0.01024365, + "balance_loss_clip": 1.04458272, + "balance_loss_mlp": 1.01728415, + "epoch": 0.8502374797090122, + "flos": 22272624201600.0, + "grad_norm": 1.8605462975381586, + "language_loss": 0.78503716, + "learning_rate": 2.3056605202393475e-07, + "loss": 0.80676532, + "num_input_tokens_seen": 152974265, + "step": 7071, + "time_per_iteration": 2.4259607791900635 + }, + { + "auxiliary_loss_clip": 0.01146862, + "auxiliary_loss_mlp": 0.0076227, + "balance_loss_clip": 1.04307997, + "balance_loss_mlp": 1.00040865, + "epoch": 0.8503577225996513, + "flos": 23659817495040.0, + "grad_norm": 1.741231877719285, + "language_loss": 0.66835445, + "learning_rate": 2.3020308527706888e-07, + "loss": 0.68744576, + "num_input_tokens_seen": 152993680, + "step": 7072, + "time_per_iteration": 2.4568064212799072 + }, + { + "auxiliary_loss_clip": 0.01141689, + "auxiliary_loss_mlp": 0.01024764, + "balance_loss_clip": 1.04329515, + "balance_loss_mlp": 1.01753986, + "epoch": 0.8504779654902904, + "flos": 26758513002240.0, + "grad_norm": 1.5741104597957458, + "language_loss": 0.88587922, + "learning_rate": 2.2984038700698715e-07, + "loss": 0.90754372, + "num_input_tokens_seen": 153012990, + "step": 7073, + "time_per_iteration": 2.512040615081787 + }, + { + "auxiliary_loss_clip": 0.0114737, + "auxiliary_loss_mlp": 0.01028099, + "balance_loss_clip": 1.04633343, + "balance_loss_mlp": 1.02081561, + "epoch": 0.8505982083809295, + "flos": 26468247196800.0, + "grad_norm": 2.330735734679678, + "language_loss": 0.78830302, + "learning_rate": 2.2947795726871222e-07, + "loss": 0.8100577, + "num_input_tokens_seen": 153034015, + "step": 7074, + "time_per_iteration": 2.471318483352661 + }, + { + "auxiliary_loss_clip": 0.01151886, + "auxiliary_loss_mlp": 0.00761997, + "balance_loss_clip": 1.05040956, + "balance_loss_mlp": 1.00046706, + "epoch": 0.8507184512715685, + "flos": 20303390926080.0, + "grad_norm": 1.6814630545809488, + "language_loss": 0.85835367, + "learning_rate": 2.2911579611722253e-07, + "loss": 0.87749255, + "num_input_tokens_seen": 153053160, + "step": 7075, + "time_per_iteration": 2.4278669357299805 + }, + { + "auxiliary_loss_clip": 0.01134444, + "auxiliary_loss_mlp": 0.01027121, + "balance_loss_clip": 1.04330981, + "balance_loss_mlp": 1.01997757, + "epoch": 0.8508386941622077, + "flos": 19025186474880.0, + "grad_norm": 10.859556549340144, + "language_loss": 0.87158132, + "learning_rate": 2.2875390360745905e-07, + "loss": 0.89319694, + "num_input_tokens_seen": 153072565, + "step": 7076, + "time_per_iteration": 2.447910785675049 + }, + { + "auxiliary_loss_clip": 0.01127973, + "auxiliary_loss_mlp": 0.01026943, + "balance_loss_clip": 1.04248953, + "balance_loss_mlp": 1.01969528, + "epoch": 0.8509589370528468, + "flos": 16433405654400.0, + "grad_norm": 1.5811002385968285, + "language_loss": 0.77661842, + "learning_rate": 2.2839227979432008e-07, + "loss": 0.79816759, + "num_input_tokens_seen": 153090215, + "step": 7077, + "time_per_iteration": 2.4639763832092285 + }, + { + "auxiliary_loss_clip": 0.01137051, + "auxiliary_loss_mlp": 0.0102733, + "balance_loss_clip": 1.04286647, + "balance_loss_mlp": 1.02009082, + "epoch": 0.8510791799434858, + "flos": 18259714713600.0, + "grad_norm": 1.8369294047695677, + "language_loss": 0.8523283, + "learning_rate": 2.2803092473266373e-07, + "loss": 0.87397212, + "num_input_tokens_seen": 153107740, + "step": 7078, + "time_per_iteration": 2.4514496326446533 + }, + { + "auxiliary_loss_clip": 0.01167681, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.04919112, + "balance_loss_mlp": 1.0211879, + "epoch": 0.851199422834125, + "flos": 23441372933760.0, + "grad_norm": 2.2591622548663004, + "language_loss": 0.87185091, + "learning_rate": 2.2766983847730724e-07, + "loss": 0.89380741, + "num_input_tokens_seen": 153127410, + "step": 7079, + "time_per_iteration": 2.4115891456604004 + }, + { + "auxiliary_loss_clip": 0.01130762, + "auxiliary_loss_mlp": 0.01027365, + "balance_loss_clip": 1.04164219, + "balance_loss_mlp": 1.02004004, + "epoch": 0.851319665724764, + "flos": 16289404030080.0, + "grad_norm": 2.1513353759889227, + "language_loss": 0.66395998, + "learning_rate": 2.2730902108302663e-07, + "loss": 0.68554127, + "num_input_tokens_seen": 153144325, + "step": 7080, + "time_per_iteration": 2.4805057048797607 + }, + { + "auxiliary_loss_clip": 0.011282, + "auxiliary_loss_mlp": 0.01024554, + "balance_loss_clip": 1.04047132, + "balance_loss_mlp": 1.01704419, + "epoch": 0.8514399086154031, + "flos": 18989347680000.0, + "grad_norm": 1.6018961236200482, + "language_loss": 0.68767834, + "learning_rate": 2.269484726045583e-07, + "loss": 0.70920593, + "num_input_tokens_seen": 153163240, + "step": 7081, + "time_per_iteration": 3.2784430980682373 + }, + { + "auxiliary_loss_clip": 0.01126317, + "auxiliary_loss_mlp": 0.01029128, + "balance_loss_clip": 1.04383409, + "balance_loss_mlp": 1.02173698, + "epoch": 0.8515601515060423, + "flos": 24571194301440.0, + "grad_norm": 1.706802448039936, + "language_loss": 0.79028738, + "learning_rate": 2.2658819309659672e-07, + "loss": 0.81184185, + "num_input_tokens_seen": 153183440, + "step": 7082, + "time_per_iteration": 3.2958412170410156 + }, + { + "auxiliary_loss_clip": 0.01135462, + "auxiliary_loss_mlp": 0.01020593, + "balance_loss_clip": 1.04651475, + "balance_loss_mlp": 1.01406932, + "epoch": 0.8516803943966813, + "flos": 19529443555200.0, + "grad_norm": 2.0187326952820426, + "language_loss": 0.84839016, + "learning_rate": 2.2622818261379706e-07, + "loss": 0.86995071, + "num_input_tokens_seen": 153200460, + "step": 7083, + "time_per_iteration": 2.4643161296844482 + }, + { + "auxiliary_loss_clip": 0.01134213, + "auxiliary_loss_mlp": 0.01025148, + "balance_loss_clip": 1.04279351, + "balance_loss_mlp": 1.01789105, + "epoch": 0.8518006372873204, + "flos": 20265792364800.0, + "grad_norm": 1.9362268411796046, + "language_loss": 0.7506994, + "learning_rate": 2.2586844121077142e-07, + "loss": 0.77229297, + "num_input_tokens_seen": 153218970, + "step": 7084, + "time_per_iteration": 3.321122407913208 + }, + { + "auxiliary_loss_clip": 0.01109514, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.03928852, + "balance_loss_mlp": 1.02259803, + "epoch": 0.8519208801779595, + "flos": 24133227770880.0, + "grad_norm": 1.823985184558145, + "language_loss": 0.72233599, + "learning_rate": 2.2550896894209215e-07, + "loss": 0.7437315, + "num_input_tokens_seen": 153238485, + "step": 7085, + "time_per_iteration": 2.5603764057159424 + }, + { + "auxiliary_loss_clip": 0.01012651, + "auxiliary_loss_mlp": 0.01000803, + "balance_loss_clip": 1.01078892, + "balance_loss_mlp": 0.99989742, + "epoch": 0.8520411230685986, + "flos": 63035223252480.0, + "grad_norm": 0.6806603840275841, + "language_loss": 0.56611848, + "learning_rate": 2.2514976586229184e-07, + "loss": 0.58625305, + "num_input_tokens_seen": 153306430, + "step": 7086, + "time_per_iteration": 3.2596099376678467 + }, + { + "auxiliary_loss_clip": 0.01052337, + "auxiliary_loss_mlp": 0.01001103, + "balance_loss_clip": 1.00837684, + "balance_loss_mlp": 1.00020313, + "epoch": 0.8521613659592376, + "flos": 65836865283840.0, + "grad_norm": 0.7580509956980961, + "language_loss": 0.5481354, + "learning_rate": 2.247908320258609e-07, + "loss": 0.5686698, + "num_input_tokens_seen": 153366520, + "step": 7087, + "time_per_iteration": 3.0005438327789307 + }, + { + "auxiliary_loss_clip": 0.01101327, + "auxiliary_loss_mlp": 0.01024358, + "balance_loss_clip": 1.04148126, + "balance_loss_mlp": 1.01699662, + "epoch": 0.8522816088498768, + "flos": 23112323418240.0, + "grad_norm": 2.148926769257755, + "language_loss": 0.7967459, + "learning_rate": 2.2443216748724914e-07, + "loss": 0.81800264, + "num_input_tokens_seen": 153387230, + "step": 7088, + "time_per_iteration": 2.5657765865325928 + }, + { + "auxiliary_loss_clip": 0.01154512, + "auxiliary_loss_mlp": 0.00761629, + "balance_loss_clip": 1.04673874, + "balance_loss_mlp": 1.00036407, + "epoch": 0.8524018517405159, + "flos": 31758140073600.0, + "grad_norm": 2.414209224319873, + "language_loss": 0.74304509, + "learning_rate": 2.2407377230086588e-07, + "loss": 0.76220644, + "num_input_tokens_seen": 153409585, + "step": 7089, + "time_per_iteration": 3.2796361446380615 + }, + { + "auxiliary_loss_clip": 0.01121988, + "auxiliary_loss_mlp": 0.01020596, + "balance_loss_clip": 1.04518378, + "balance_loss_mlp": 1.01373291, + "epoch": 0.8525220946311549, + "flos": 18690318956160.0, + "grad_norm": 1.8693936582278239, + "language_loss": 0.83429128, + "learning_rate": 2.23715646521079e-07, + "loss": 0.85571712, + "num_input_tokens_seen": 153427105, + "step": 7090, + "time_per_iteration": 2.5232646465301514 + }, + { + "auxiliary_loss_clip": 0.01153943, + "auxiliary_loss_mlp": 0.00761992, + "balance_loss_clip": 1.04529393, + "balance_loss_mlp": 1.00046611, + "epoch": 0.852642337521794, + "flos": 21793216354560.0, + "grad_norm": 1.9545998235180504, + "language_loss": 0.83970618, + "learning_rate": 2.2335779020221724e-07, + "loss": 0.85886556, + "num_input_tokens_seen": 153443725, + "step": 7091, + "time_per_iteration": 2.4448933601379395 + }, + { + "auxiliary_loss_clip": 0.0105643, + "auxiliary_loss_mlp": 0.01002323, + "balance_loss_clip": 1.01856983, + "balance_loss_mlp": 1.00123203, + "epoch": 0.8527625804124331, + "flos": 69040132260480.0, + "grad_norm": 0.7988800743528195, + "language_loss": 0.56406486, + "learning_rate": 2.2300020339856497e-07, + "loss": 0.5846523, + "num_input_tokens_seen": 153506410, + "step": 7092, + "time_per_iteration": 3.0925779342651367 + }, + { + "auxiliary_loss_clip": 0.01132005, + "auxiliary_loss_mlp": 0.01022019, + "balance_loss_clip": 1.04318929, + "balance_loss_mlp": 1.01496196, + "epoch": 0.8528828233030722, + "flos": 26979399688320.0, + "grad_norm": 2.170279768260591, + "language_loss": 0.78223598, + "learning_rate": 2.2264288616436966e-07, + "loss": 0.80377626, + "num_input_tokens_seen": 153526665, + "step": 7093, + "time_per_iteration": 2.554919481277466 + }, + { + "auxiliary_loss_clip": 0.01129065, + "auxiliary_loss_mlp": 0.01025349, + "balance_loss_clip": 1.04226148, + "balance_loss_mlp": 1.01838422, + "epoch": 0.8530030661937112, + "flos": 17487598936320.0, + "grad_norm": 2.3092569011024198, + "language_loss": 0.72660351, + "learning_rate": 2.222858385538351e-07, + "loss": 0.74814761, + "num_input_tokens_seen": 153543465, + "step": 7094, + "time_per_iteration": 2.4423117637634277 + }, + { + "auxiliary_loss_clip": 0.01146373, + "auxiliary_loss_mlp": 0.01028126, + "balance_loss_clip": 1.04410696, + "balance_loss_mlp": 1.02111912, + "epoch": 0.8531233090843504, + "flos": 22160798184960.0, + "grad_norm": 3.44874137753195, + "language_loss": 0.68056417, + "learning_rate": 2.2192906062112527e-07, + "loss": 0.70230913, + "num_input_tokens_seen": 153563340, + "step": 7095, + "time_per_iteration": 2.4753835201263428 + }, + { + "auxiliary_loss_clip": 0.01163591, + "auxiliary_loss_mlp": 0.01021055, + "balance_loss_clip": 1.04470897, + "balance_loss_mlp": 1.01439142, + "epoch": 0.8532435519749895, + "flos": 37635388145280.0, + "grad_norm": 1.584784203027402, + "language_loss": 0.70633185, + "learning_rate": 2.2157255242036377e-07, + "loss": 0.72817832, + "num_input_tokens_seen": 153587005, + "step": 7096, + "time_per_iteration": 2.5972514152526855 + }, + { + "auxiliary_loss_clip": 0.01118378, + "auxiliary_loss_mlp": 0.01025146, + "balance_loss_clip": 1.04140472, + "balance_loss_mlp": 1.01800263, + "epoch": 0.8533637948656285, + "flos": 21398163598080.0, + "grad_norm": 2.844418129219226, + "language_loss": 0.74186468, + "learning_rate": 2.2121631400563135e-07, + "loss": 0.76329982, + "num_input_tokens_seen": 153606835, + "step": 7097, + "time_per_iteration": 2.513479709625244 + }, + { + "auxiliary_loss_clip": 0.01049414, + "auxiliary_loss_mlp": 0.01001437, + "balance_loss_clip": 1.01043344, + "balance_loss_mlp": 1.00050151, + "epoch": 0.8534840377562677, + "flos": 53345122490880.0, + "grad_norm": 0.7640189036336291, + "language_loss": 0.53004527, + "learning_rate": 2.208603454309701e-07, + "loss": 0.55055374, + "num_input_tokens_seen": 153664925, + "step": 7098, + "time_per_iteration": 3.006751775741577 + }, + { + "auxiliary_loss_clip": 0.01109014, + "auxiliary_loss_mlp": 0.01025397, + "balance_loss_clip": 1.04181433, + "balance_loss_mlp": 1.01755607, + "epoch": 0.8536042806469067, + "flos": 20814148368000.0, + "grad_norm": 2.2556983047475723, + "language_loss": 0.70660621, + "learning_rate": 2.2050464675037994e-07, + "loss": 0.72795033, + "num_input_tokens_seen": 153683550, + "step": 7099, + "time_per_iteration": 2.573735237121582 + }, + { + "auxiliary_loss_clip": 0.01136092, + "auxiliary_loss_mlp": 0.01026461, + "balance_loss_clip": 1.0445708, + "balance_loss_mlp": 1.01921868, + "epoch": 0.8537245235375458, + "flos": 24681368292480.0, + "grad_norm": 2.2744954366880434, + "language_loss": 0.72964811, + "learning_rate": 2.2014921801782016e-07, + "loss": 0.75127357, + "num_input_tokens_seen": 153703040, + "step": 7100, + "time_per_iteration": 2.505391836166382 + }, + { + "auxiliary_loss_clip": 0.01136182, + "auxiliary_loss_mlp": 0.0102226, + "balance_loss_clip": 1.03957009, + "balance_loss_mlp": 1.01538169, + "epoch": 0.853844766428185, + "flos": 24384817607040.0, + "grad_norm": 1.8797281719053744, + "language_loss": 0.74105167, + "learning_rate": 2.1979405928720872e-07, + "loss": 0.76263613, + "num_input_tokens_seen": 153722695, + "step": 7101, + "time_per_iteration": 2.5264322757720947 + }, + { + "auxiliary_loss_clip": 0.01139696, + "auxiliary_loss_mlp": 0.01021695, + "balance_loss_clip": 1.04485679, + "balance_loss_mlp": 1.01508152, + "epoch": 0.853965009318824, + "flos": 20955707867520.0, + "grad_norm": 1.5139654686054798, + "language_loss": 0.79297006, + "learning_rate": 2.1943917061242257e-07, + "loss": 0.81458396, + "num_input_tokens_seen": 153742550, + "step": 7102, + "time_per_iteration": 2.4963033199310303 + }, + { + "auxiliary_loss_clip": 0.01158273, + "auxiliary_loss_mlp": 0.00762013, + "balance_loss_clip": 1.04658568, + "balance_loss_mlp": 1.00049496, + "epoch": 0.8540852522094631, + "flos": 24201816791040.0, + "grad_norm": 1.7341689569974803, + "language_loss": 0.66479659, + "learning_rate": 2.1908455204729903e-07, + "loss": 0.68399942, + "num_input_tokens_seen": 153761700, + "step": 7103, + "time_per_iteration": 2.471831798553467 + }, + { + "auxiliary_loss_clip": 0.01136763, + "auxiliary_loss_mlp": 0.0102547, + "balance_loss_clip": 1.04289103, + "balance_loss_mlp": 1.01815653, + "epoch": 0.8542054951001022, + "flos": 25082921410560.0, + "grad_norm": 2.693554586123264, + "language_loss": 0.78179169, + "learning_rate": 2.1873020364563265e-07, + "loss": 0.80341399, + "num_input_tokens_seen": 153780765, + "step": 7104, + "time_per_iteration": 2.512988567352295 + }, + { + "auxiliary_loss_clip": 0.01146736, + "auxiliary_loss_mlp": 0.0102334, + "balance_loss_clip": 1.04565895, + "balance_loss_mlp": 1.01646447, + "epoch": 0.8543257379907413, + "flos": 24316551809280.0, + "grad_norm": 24.71493465391521, + "language_loss": 0.75604618, + "learning_rate": 2.183761254611789e-07, + "loss": 0.77774698, + "num_input_tokens_seen": 153801090, + "step": 7105, + "time_per_iteration": 2.463827610015869 + }, + { + "auxiliary_loss_clip": 0.01149472, + "auxiliary_loss_mlp": 0.01027142, + "balance_loss_clip": 1.04569709, + "balance_loss_mlp": 1.02007318, + "epoch": 0.8544459808813804, + "flos": 55286630467200.0, + "grad_norm": 2.015847224516383, + "language_loss": 0.70016515, + "learning_rate": 2.1802231754764987e-07, + "loss": 0.72193128, + "num_input_tokens_seen": 153826530, + "step": 7106, + "time_per_iteration": 2.7713136672973633 + }, + { + "auxiliary_loss_clip": 0.01136835, + "auxiliary_loss_mlp": 0.01027235, + "balance_loss_clip": 1.04165757, + "balance_loss_mlp": 1.01954257, + "epoch": 0.8545662237720195, + "flos": 25776248705280.0, + "grad_norm": 1.8350454229343163, + "language_loss": 0.76465547, + "learning_rate": 2.17668779958718e-07, + "loss": 0.78629619, + "num_input_tokens_seen": 153849110, + "step": 7107, + "time_per_iteration": 2.527698516845703 + }, + { + "auxiliary_loss_clip": 0.01164663, + "auxiliary_loss_mlp": 0.01023817, + "balance_loss_clip": 1.04628968, + "balance_loss_mlp": 1.01670957, + "epoch": 0.8546864666626586, + "flos": 11108320427520.0, + "grad_norm": 2.8153978998934415, + "language_loss": 0.80400735, + "learning_rate": 2.1731551274801553e-07, + "loss": 0.82589221, + "num_input_tokens_seen": 153865550, + "step": 7108, + "time_per_iteration": 3.35726261138916 + }, + { + "auxiliary_loss_clip": 0.01140238, + "auxiliary_loss_mlp": 0.01024911, + "balance_loss_clip": 1.04658413, + "balance_loss_mlp": 1.01740098, + "epoch": 0.8548067095532976, + "flos": 25520169669120.0, + "grad_norm": 2.9064952533683606, + "language_loss": 0.61815995, + "learning_rate": 2.169625159691324e-07, + "loss": 0.63981146, + "num_input_tokens_seen": 153885425, + "step": 7109, + "time_per_iteration": 3.27755069732666 + }, + { + "auxiliary_loss_clip": 0.01118058, + "auxiliary_loss_mlp": 0.01024274, + "balance_loss_clip": 1.04166985, + "balance_loss_mlp": 1.01730919, + "epoch": 0.8549269524439368, + "flos": 24717853532160.0, + "grad_norm": 2.3224754343060345, + "language_loss": 0.7424742, + "learning_rate": 2.1660978967561784e-07, + "loss": 0.76389754, + "num_input_tokens_seen": 153904760, + "step": 7110, + "time_per_iteration": 3.447495698928833 + }, + { + "auxiliary_loss_clip": 0.01163556, + "auxiliary_loss_mlp": 0.0102038, + "balance_loss_clip": 1.04498148, + "balance_loss_mlp": 1.01380217, + "epoch": 0.8550471953345758, + "flos": 19825599191040.0, + "grad_norm": 2.7017082195453903, + "language_loss": 0.78816509, + "learning_rate": 2.1625733392098035e-07, + "loss": 0.81000435, + "num_input_tokens_seen": 153920370, + "step": 7111, + "time_per_iteration": 2.401700735092163 + }, + { + "auxiliary_loss_clip": 0.01164089, + "auxiliary_loss_mlp": 0.01023094, + "balance_loss_clip": 1.04617691, + "balance_loss_mlp": 1.01623917, + "epoch": 0.8551674382252149, + "flos": 22820441500800.0, + "grad_norm": 1.583238807419085, + "language_loss": 0.79560065, + "learning_rate": 2.159051487586867e-07, + "loss": 0.81747246, + "num_input_tokens_seen": 153940500, + "step": 7112, + "time_per_iteration": 2.4194259643554688 + }, + { + "auxiliary_loss_clip": 0.01141066, + "auxiliary_loss_mlp": 0.01029279, + "balance_loss_clip": 1.04665303, + "balance_loss_mlp": 1.02133965, + "epoch": 0.8552876811158541, + "flos": 20631255292800.0, + "grad_norm": 2.010515579972332, + "language_loss": 0.72758937, + "learning_rate": 2.155532342421642e-07, + "loss": 0.74929285, + "num_input_tokens_seen": 153958500, + "step": 7113, + "time_per_iteration": 2.4786014556884766 + }, + { + "auxiliary_loss_clip": 0.01154349, + "auxiliary_loss_mlp": 0.01028762, + "balance_loss_clip": 1.04639828, + "balance_loss_mlp": 1.02130246, + "epoch": 0.8554079240064931, + "flos": 23112359331840.0, + "grad_norm": 1.7578985421022326, + "language_loss": 0.78201163, + "learning_rate": 2.1520159042479636e-07, + "loss": 0.80384278, + "num_input_tokens_seen": 153976790, + "step": 7114, + "time_per_iteration": 2.466628074645996 + }, + { + "auxiliary_loss_clip": 0.01150476, + "auxiliary_loss_mlp": 0.01024776, + "balance_loss_clip": 1.04641366, + "balance_loss_mlp": 1.01762962, + "epoch": 0.8555281668971322, + "flos": 22128047959680.0, + "grad_norm": 2.1415884181085025, + "language_loss": 0.71078467, + "learning_rate": 2.148502173599287e-07, + "loss": 0.73253727, + "num_input_tokens_seen": 153994930, + "step": 7115, + "time_per_iteration": 2.4356510639190674 + }, + { + "auxiliary_loss_clip": 0.01130847, + "auxiliary_loss_mlp": 0.01019869, + "balance_loss_clip": 1.04307628, + "balance_loss_mlp": 1.01238227, + "epoch": 0.8556484097877713, + "flos": 31139040234240.0, + "grad_norm": 1.7688118373270425, + "language_loss": 0.66214544, + "learning_rate": 2.1449911510086372e-07, + "loss": 0.68365264, + "num_input_tokens_seen": 154014400, + "step": 7116, + "time_per_iteration": 3.298696517944336 + }, + { + "auxiliary_loss_clip": 0.01147216, + "auxiliary_loss_mlp": 0.0102453, + "balance_loss_clip": 1.04423118, + "balance_loss_mlp": 1.01752079, + "epoch": 0.8557686526784104, + "flos": 24316551809280.0, + "grad_norm": 1.752646639779243, + "language_loss": 0.76703835, + "learning_rate": 2.141482837008628e-07, + "loss": 0.78875583, + "num_input_tokens_seen": 154034940, + "step": 7117, + "time_per_iteration": 2.4745707511901855 + }, + { + "auxiliary_loss_clip": 0.01141461, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.04265523, + "balance_loss_mlp": 1.02091861, + "epoch": 0.8558888955690495, + "flos": 17712723427200.0, + "grad_norm": 1.915362085385774, + "language_loss": 0.71950197, + "learning_rate": 2.1379772321314826e-07, + "loss": 0.74120009, + "num_input_tokens_seen": 154052985, + "step": 7118, + "time_per_iteration": 2.4002320766448975 + }, + { + "auxiliary_loss_clip": 0.01088184, + "auxiliary_loss_mlp": 0.01026403, + "balance_loss_clip": 1.04164338, + "balance_loss_mlp": 1.01889861, + "epoch": 0.8560091384596886, + "flos": 19171702051200.0, + "grad_norm": 1.9415124180322658, + "language_loss": 0.81642765, + "learning_rate": 2.1344743369089802e-07, + "loss": 0.83757353, + "num_input_tokens_seen": 154068765, + "step": 7119, + "time_per_iteration": 2.5673701763153076 + }, + { + "auxiliary_loss_clip": 0.01138132, + "auxiliary_loss_mlp": 0.01024128, + "balance_loss_clip": 1.0464741, + "balance_loss_mlp": 1.01710999, + "epoch": 0.8561293813503277, + "flos": 23914855036800.0, + "grad_norm": 1.6660649511231194, + "language_loss": 0.81984925, + "learning_rate": 2.130974151872522e-07, + "loss": 0.84147185, + "num_input_tokens_seen": 154089100, + "step": 7120, + "time_per_iteration": 2.5017502307891846 + }, + { + "auxiliary_loss_clip": 0.01127324, + "auxiliary_loss_mlp": 0.01023549, + "balance_loss_clip": 1.04591632, + "balance_loss_mlp": 1.01660848, + "epoch": 0.8562496242409667, + "flos": 22529206028160.0, + "grad_norm": 1.6972027605980051, + "language_loss": 0.78405559, + "learning_rate": 2.1274766775530773e-07, + "loss": 0.80556428, + "num_input_tokens_seen": 154108965, + "step": 7121, + "time_per_iteration": 2.5228111743927 + }, + { + "auxiliary_loss_clip": 0.01166229, + "auxiliary_loss_mlp": 0.01021753, + "balance_loss_clip": 1.04575682, + "balance_loss_mlp": 1.01455534, + "epoch": 0.8563698671316058, + "flos": 14712745472640.0, + "grad_norm": 2.262499353113867, + "language_loss": 0.79953611, + "learning_rate": 2.1239819144812077e-07, + "loss": 0.8214159, + "num_input_tokens_seen": 154123425, + "step": 7122, + "time_per_iteration": 2.382805109024048 + }, + { + "auxiliary_loss_clip": 0.01115698, + "auxiliary_loss_mlp": 0.01024639, + "balance_loss_clip": 1.03922987, + "balance_loss_mlp": 1.01757264, + "epoch": 0.856490110022245, + "flos": 39167768211840.0, + "grad_norm": 1.7684929227127204, + "language_loss": 0.70028448, + "learning_rate": 2.1204898631870716e-07, + "loss": 0.72168791, + "num_input_tokens_seen": 154148315, + "step": 7123, + "time_per_iteration": 2.654613733291626 + }, + { + "auxiliary_loss_clip": 0.01137594, + "auxiliary_loss_mlp": 0.01022538, + "balance_loss_clip": 1.04618323, + "balance_loss_mlp": 1.01592505, + "epoch": 0.856610352912884, + "flos": 29059345658880.0, + "grad_norm": 1.713443754030315, + "language_loss": 0.75897026, + "learning_rate": 2.1170005242004006e-07, + "loss": 0.78057152, + "num_input_tokens_seen": 154169665, + "step": 7124, + "time_per_iteration": 2.5396406650543213 + }, + { + "auxiliary_loss_clip": 0.0114077, + "auxiliary_loss_mlp": 0.01020307, + "balance_loss_clip": 1.04329169, + "balance_loss_mlp": 1.01335979, + "epoch": 0.8567305958035231, + "flos": 23878333883520.0, + "grad_norm": 1.6401032456238034, + "language_loss": 0.77739376, + "learning_rate": 2.1135138980505384e-07, + "loss": 0.7990045, + "num_input_tokens_seen": 154190335, + "step": 7125, + "time_per_iteration": 2.5006911754608154 + }, + { + "auxiliary_loss_clip": 0.01133988, + "auxiliary_loss_mlp": 0.01021823, + "balance_loss_clip": 1.04508853, + "balance_loss_mlp": 1.01464081, + "epoch": 0.8568508386941622, + "flos": 22200120599040.0, + "grad_norm": 1.6923256701763048, + "language_loss": 0.7226361, + "learning_rate": 2.110029985266395e-07, + "loss": 0.74419427, + "num_input_tokens_seen": 154210040, + "step": 7126, + "time_per_iteration": 2.481630563735962 + }, + { + "auxiliary_loss_clip": 0.01139962, + "auxiliary_loss_mlp": 0.01023347, + "balance_loss_clip": 1.04299104, + "balance_loss_mlp": 1.01628399, + "epoch": 0.8569710815848013, + "flos": 17307507121920.0, + "grad_norm": 1.717440676243043, + "language_loss": 0.73825502, + "learning_rate": 2.1065487863764787e-07, + "loss": 0.75988805, + "num_input_tokens_seen": 154228385, + "step": 7127, + "time_per_iteration": 2.4848124980926514 + }, + { + "auxiliary_loss_clip": 0.01099867, + "auxiliary_loss_mlp": 0.01022333, + "balance_loss_clip": 1.03663266, + "balance_loss_mlp": 1.01476645, + "epoch": 0.8570913244754403, + "flos": 23732285184000.0, + "grad_norm": 1.5067486900678617, + "language_loss": 0.85707498, + "learning_rate": 2.1030703019088846e-07, + "loss": 0.87829697, + "num_input_tokens_seen": 154249015, + "step": 7128, + "time_per_iteration": 2.5588924884796143 + }, + { + "auxiliary_loss_clip": 0.01144073, + "auxiliary_loss_mlp": 0.01022251, + "balance_loss_clip": 1.04383111, + "balance_loss_mlp": 1.01531649, + "epoch": 0.8572115673660795, + "flos": 20048748433920.0, + "grad_norm": 1.8404305668588965, + "language_loss": 0.71009517, + "learning_rate": 2.099594532391291e-07, + "loss": 0.73175842, + "num_input_tokens_seen": 154267700, + "step": 7129, + "time_per_iteration": 2.447481870651245 + }, + { + "auxiliary_loss_clip": 0.01141461, + "auxiliary_loss_mlp": 0.01022812, + "balance_loss_clip": 1.04364192, + "balance_loss_mlp": 1.01587701, + "epoch": 0.8573318102567186, + "flos": 27160389342720.0, + "grad_norm": 2.135642708660653, + "language_loss": 0.79491234, + "learning_rate": 2.0961214783509806e-07, + "loss": 0.81655508, + "num_input_tokens_seen": 154290580, + "step": 7130, + "time_per_iteration": 2.4881370067596436 + }, + { + "auxiliary_loss_clip": 0.01141851, + "auxiliary_loss_mlp": 0.01022728, + "balance_loss_clip": 1.04389179, + "balance_loss_mlp": 1.01544392, + "epoch": 0.8574520531473576, + "flos": 24936585402240.0, + "grad_norm": 1.672699755340515, + "language_loss": 0.74431634, + "learning_rate": 2.0926511403148051e-07, + "loss": 0.76596206, + "num_input_tokens_seen": 154309545, + "step": 7131, + "time_per_iteration": 2.5279552936553955 + }, + { + "auxiliary_loss_clip": 0.01131397, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.04535735, + "balance_loss_mlp": 1.02132261, + "epoch": 0.8575722960379968, + "flos": 18771154513920.0, + "grad_norm": 2.010697116662194, + "language_loss": 0.75646871, + "learning_rate": 2.0891835188092143e-07, + "loss": 0.77806562, + "num_input_tokens_seen": 154326545, + "step": 7132, + "time_per_iteration": 2.485581398010254 + }, + { + "auxiliary_loss_clip": 0.01130087, + "auxiliary_loss_mlp": 0.0102527, + "balance_loss_clip": 1.04269528, + "balance_loss_mlp": 1.01760507, + "epoch": 0.8576925389286358, + "flos": 22200300167040.0, + "grad_norm": 1.785602718310032, + "language_loss": 0.81554157, + "learning_rate": 2.0857186143602434e-07, + "loss": 0.83709514, + "num_input_tokens_seen": 154345190, + "step": 7133, + "time_per_iteration": 2.5050699710845947 + }, + { + "auxiliary_loss_clip": 0.01112251, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.03923154, + "balance_loss_mlp": 1.02162695, + "epoch": 0.8578127818192749, + "flos": 22894345733760.0, + "grad_norm": 2.153421500470876, + "language_loss": 0.67821896, + "learning_rate": 2.0822564274935094e-07, + "loss": 0.69963348, + "num_input_tokens_seen": 154364615, + "step": 7134, + "time_per_iteration": 3.3863205909729004 + }, + { + "auxiliary_loss_clip": 0.01138747, + "auxiliary_loss_mlp": 0.01024078, + "balance_loss_clip": 1.04717696, + "balance_loss_mlp": 1.01645803, + "epoch": 0.8579330247099141, + "flos": 34824839541120.0, + "grad_norm": 1.6992535868343805, + "language_loss": 0.67037719, + "learning_rate": 2.078796958734239e-07, + "loss": 0.69200546, + "num_input_tokens_seen": 154387335, + "step": 7135, + "time_per_iteration": 3.3488194942474365 + }, + { + "auxiliary_loss_clip": 0.01150362, + "auxiliary_loss_mlp": 0.01025291, + "balance_loss_clip": 1.04596567, + "balance_loss_mlp": 1.01869583, + "epoch": 0.8580532676005531, + "flos": 19755681367680.0, + "grad_norm": 1.810673972587797, + "language_loss": 0.75111914, + "learning_rate": 2.0753402086072124e-07, + "loss": 0.77287567, + "num_input_tokens_seen": 154405965, + "step": 7136, + "time_per_iteration": 2.4711809158325195 + }, + { + "auxiliary_loss_clip": 0.01092402, + "auxiliary_loss_mlp": 0.01028217, + "balance_loss_clip": 1.04108095, + "balance_loss_mlp": 1.02101052, + "epoch": 0.8581735104911922, + "flos": 22739318634240.0, + "grad_norm": 3.4370197576638586, + "language_loss": 0.75543499, + "learning_rate": 2.071886177636828e-07, + "loss": 0.77664119, + "num_input_tokens_seen": 154422750, + "step": 7137, + "time_per_iteration": 3.4663712978363037 + }, + { + "auxiliary_loss_clip": 0.01148035, + "auxiliary_loss_mlp": 0.01022027, + "balance_loss_clip": 1.04576111, + "balance_loss_mlp": 1.01519358, + "epoch": 0.8582937533818313, + "flos": 23149131880320.0, + "grad_norm": 1.8716892172047015, + "language_loss": 0.83404833, + "learning_rate": 2.0684348663470575e-07, + "loss": 0.85574889, + "num_input_tokens_seen": 154442930, + "step": 7138, + "time_per_iteration": 2.4559977054595947 + }, + { + "auxiliary_loss_clip": 0.01133601, + "auxiliary_loss_mlp": 0.01025404, + "balance_loss_clip": 1.03944218, + "balance_loss_mlp": 1.01821876, + "epoch": 0.8584139962724704, + "flos": 19498668577920.0, + "grad_norm": 1.9761921902968498, + "language_loss": 0.61640847, + "learning_rate": 2.0649862752614555e-07, + "loss": 0.63799858, + "num_input_tokens_seen": 154461640, + "step": 7139, + "time_per_iteration": 2.475545644760132 + }, + { + "auxiliary_loss_clip": 0.01042598, + "auxiliary_loss_mlp": 0.01000632, + "balance_loss_clip": 1.00727153, + "balance_loss_mlp": 0.9997142, + "epoch": 0.8585342391631094, + "flos": 71276577788160.0, + "grad_norm": 0.7948786302469087, + "language_loss": 0.57112753, + "learning_rate": 2.0615404049031838e-07, + "loss": 0.59155977, + "num_input_tokens_seen": 154518610, + "step": 7140, + "time_per_iteration": 3.0986568927764893 + }, + { + "auxiliary_loss_clip": 0.01152329, + "auxiliary_loss_mlp": 0.01026267, + "balance_loss_clip": 1.04647374, + "balance_loss_mlp": 1.01862538, + "epoch": 0.8586544820537486, + "flos": 10815432929280.0, + "grad_norm": 2.54544364075567, + "language_loss": 0.77928972, + "learning_rate": 2.0580972557949616e-07, + "loss": 0.8010757, + "num_input_tokens_seen": 154533700, + "step": 7141, + "time_per_iteration": 2.4081368446350098 + }, + { + "auxiliary_loss_clip": 0.01053219, + "auxiliary_loss_mlp": 0.01000548, + "balance_loss_clip": 1.0076679, + "balance_loss_mlp": 0.99956435, + "epoch": 0.8587747249443877, + "flos": 64811184422400.0, + "grad_norm": 0.7943313617794318, + "language_loss": 0.54253036, + "learning_rate": 2.054656828459125e-07, + "loss": 0.56306803, + "num_input_tokens_seen": 154597810, + "step": 7142, + "time_per_iteration": 3.787107229232788 + }, + { + "auxiliary_loss_clip": 0.01105922, + "auxiliary_loss_mlp": 0.01025544, + "balance_loss_clip": 1.04046488, + "balance_loss_mlp": 1.01786041, + "epoch": 0.8588949678350267, + "flos": 26834607964800.0, + "grad_norm": 1.792004687903429, + "language_loss": 0.77316689, + "learning_rate": 2.051219123417578e-07, + "loss": 0.79448158, + "num_input_tokens_seen": 154617870, + "step": 7143, + "time_per_iteration": 2.601811170578003 + }, + { + "auxiliary_loss_clip": 0.01166115, + "auxiliary_loss_mlp": 0.01024218, + "balance_loss_clip": 1.04571009, + "balance_loss_mlp": 1.01630831, + "epoch": 0.8590152107256659, + "flos": 26104256726400.0, + "grad_norm": 2.120210973560474, + "language_loss": 0.59869969, + "learning_rate": 2.0477841411918196e-07, + "loss": 0.62060302, + "num_input_tokens_seen": 154637395, + "step": 7144, + "time_per_iteration": 2.4668514728546143 + }, + { + "auxiliary_loss_clip": 0.01144708, + "auxiliary_loss_mlp": 0.01022133, + "balance_loss_clip": 1.04402995, + "balance_loss_mlp": 1.01509368, + "epoch": 0.859135453616305, + "flos": 26140885620480.0, + "grad_norm": 1.8277466197563197, + "language_loss": 0.75026977, + "learning_rate": 2.0443518823029326e-07, + "loss": 0.7719382, + "num_input_tokens_seen": 154657935, + "step": 7145, + "time_per_iteration": 2.522007465362549 + }, + { + "auxiliary_loss_clip": 0.01117454, + "auxiliary_loss_mlp": 0.01027138, + "balance_loss_clip": 1.04153407, + "balance_loss_mlp": 1.0197289, + "epoch": 0.859255696506944, + "flos": 12969319046400.0, + "grad_norm": 2.0723992810253353, + "language_loss": 0.76774371, + "learning_rate": 2.0409223472715854e-07, + "loss": 0.78918958, + "num_input_tokens_seen": 154675080, + "step": 7146, + "time_per_iteration": 2.488037586212158 + }, + { + "auxiliary_loss_clip": 0.01124392, + "auxiliary_loss_mlp": 0.00761389, + "balance_loss_clip": 1.04345751, + "balance_loss_mlp": 1.00045025, + "epoch": 0.8593759393975832, + "flos": 18475753063680.0, + "grad_norm": 1.8579247381037083, + "language_loss": 0.75002623, + "learning_rate": 2.0374955366180434e-07, + "loss": 0.76888394, + "num_input_tokens_seen": 154692720, + "step": 7147, + "time_per_iteration": 2.5174143314361572 + }, + { + "auxiliary_loss_clip": 0.01126578, + "auxiliary_loss_mlp": 0.01021481, + "balance_loss_clip": 1.04173207, + "balance_loss_mlp": 1.01446247, + "epoch": 0.8594961822882222, + "flos": 22200156512640.0, + "grad_norm": 1.6797549329276067, + "language_loss": 0.72642541, + "learning_rate": 2.034071450862147e-07, + "loss": 0.74790597, + "num_input_tokens_seen": 154710190, + "step": 7148, + "time_per_iteration": 2.5408291816711426 + }, + { + "auxiliary_loss_clip": 0.01139129, + "auxiliary_loss_mlp": 0.01023014, + "balance_loss_clip": 1.04301882, + "balance_loss_mlp": 1.015432, + "epoch": 0.8596164251788613, + "flos": 23294749616640.0, + "grad_norm": 1.8180564129992038, + "language_loss": 0.76850498, + "learning_rate": 2.030650090523327e-07, + "loss": 0.79012644, + "num_input_tokens_seen": 154729380, + "step": 7149, + "time_per_iteration": 2.5200419425964355 + }, + { + "auxiliary_loss_clip": 0.01120077, + "auxiliary_loss_mlp": 0.01023761, + "balance_loss_clip": 1.04114318, + "balance_loss_mlp": 1.01627517, + "epoch": 0.8597366680695004, + "flos": 31649905416960.0, + "grad_norm": 1.7057502844911314, + "language_loss": 0.59345913, + "learning_rate": 2.0272314561205995e-07, + "loss": 0.61489749, + "num_input_tokens_seen": 154749775, + "step": 7150, + "time_per_iteration": 2.5961899757385254 + }, + { + "auxiliary_loss_clip": 0.01115746, + "auxiliary_loss_mlp": 0.01019331, + "balance_loss_clip": 1.03978515, + "balance_loss_mlp": 1.01245558, + "epoch": 0.8598569109601395, + "flos": 21287738211840.0, + "grad_norm": 2.2297157319955887, + "language_loss": 0.72871649, + "learning_rate": 2.023815548172567e-07, + "loss": 0.75006723, + "num_input_tokens_seen": 154769845, + "step": 7151, + "time_per_iteration": 2.5622732639312744 + }, + { + "auxiliary_loss_clip": 0.01150406, + "auxiliary_loss_mlp": 0.01023706, + "balance_loss_clip": 1.04482269, + "balance_loss_mlp": 1.01644015, + "epoch": 0.8599771538507786, + "flos": 25447809720960.0, + "grad_norm": 1.6101062331573892, + "language_loss": 0.6585561, + "learning_rate": 2.0204023671974267e-07, + "loss": 0.68029726, + "num_input_tokens_seen": 154789230, + "step": 7152, + "time_per_iteration": 2.4884986877441406 + }, + { + "auxiliary_loss_clip": 0.01143532, + "auxiliary_loss_mlp": 0.01023821, + "balance_loss_clip": 1.04296422, + "balance_loss_mlp": 1.01630533, + "epoch": 0.8600973967414177, + "flos": 16723958768640.0, + "grad_norm": 2.0342596018549015, + "language_loss": 0.81103659, + "learning_rate": 2.0169919137129532e-07, + "loss": 0.83271015, + "num_input_tokens_seen": 154807670, + "step": 7153, + "time_per_iteration": 2.436661720275879 + }, + { + "auxiliary_loss_clip": 0.01151922, + "auxiliary_loss_mlp": 0.01026136, + "balance_loss_clip": 1.04653025, + "balance_loss_mlp": 1.01864386, + "epoch": 0.8602176396320568, + "flos": 25227928615680.0, + "grad_norm": 3.207644420848581, + "language_loss": 0.70785671, + "learning_rate": 2.013584188236508e-07, + "loss": 0.72963732, + "num_input_tokens_seen": 154825575, + "step": 7154, + "time_per_iteration": 2.4701919555664062 + }, + { + "auxiliary_loss_clip": 0.01166557, + "auxiliary_loss_mlp": 0.01024054, + "balance_loss_clip": 1.04713452, + "balance_loss_mlp": 1.0166328, + "epoch": 0.8603378825226958, + "flos": 20412236113920.0, + "grad_norm": 1.8171104162711655, + "language_loss": 0.7927283, + "learning_rate": 2.0101791912850396e-07, + "loss": 0.81463444, + "num_input_tokens_seen": 154845115, + "step": 7155, + "time_per_iteration": 2.4896609783172607 + }, + { + "auxiliary_loss_clip": 0.01136658, + "auxiliary_loss_mlp": 0.01018456, + "balance_loss_clip": 1.04521883, + "balance_loss_mlp": 1.01131868, + "epoch": 0.8604581254133349, + "flos": 34930201109760.0, + "grad_norm": 2.06850491880919, + "language_loss": 0.64094603, + "learning_rate": 2.006776923375082e-07, + "loss": 0.66249716, + "num_input_tokens_seen": 154866770, + "step": 7156, + "time_per_iteration": 2.5888118743896484 + }, + { + "auxiliary_loss_clip": 0.01164964, + "auxiliary_loss_mlp": 0.01020246, + "balance_loss_clip": 1.04633975, + "balance_loss_mlp": 1.01316762, + "epoch": 0.860578368303974, + "flos": 22596538072320.0, + "grad_norm": 1.615905795665432, + "language_loss": 0.71076477, + "learning_rate": 2.003377385022764e-07, + "loss": 0.7326169, + "num_input_tokens_seen": 154885595, + "step": 7157, + "time_per_iteration": 2.442112445831299 + }, + { + "auxiliary_loss_clip": 0.01136359, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.04194939, + "balance_loss_mlp": 1.01952505, + "epoch": 0.8606986111946131, + "flos": 21324331192320.0, + "grad_norm": 2.7986868252528163, + "language_loss": 0.77131236, + "learning_rate": 1.9999805767437826e-07, + "loss": 0.7929374, + "num_input_tokens_seen": 154904485, + "step": 7158, + "time_per_iteration": 2.4537353515625 + }, + { + "auxiliary_loss_clip": 0.01130876, + "auxiliary_loss_mlp": 0.01022342, + "balance_loss_clip": 1.04168081, + "balance_loss_mlp": 1.01549327, + "epoch": 0.8608188540852522, + "flos": 28877206769280.0, + "grad_norm": 1.6227244744477838, + "language_loss": 0.71838284, + "learning_rate": 1.9965864990534386e-07, + "loss": 0.73991501, + "num_input_tokens_seen": 154925010, + "step": 7159, + "time_per_iteration": 2.5383620262145996 + }, + { + "auxiliary_loss_clip": 0.01114805, + "auxiliary_loss_mlp": 0.01022494, + "balance_loss_clip": 1.03851306, + "balance_loss_mlp": 1.01589882, + "epoch": 0.8609390969758913, + "flos": 29716187713920.0, + "grad_norm": 1.7403032928859041, + "language_loss": 0.77594042, + "learning_rate": 1.9931951524666092e-07, + "loss": 0.79731339, + "num_input_tokens_seen": 154946100, + "step": 7160, + "time_per_iteration": 2.657500743865967 + }, + { + "auxiliary_loss_clip": 0.01153075, + "auxiliary_loss_mlp": 0.00761612, + "balance_loss_clip": 1.04609132, + "balance_loss_mlp": 1.00046122, + "epoch": 0.8610593398665304, + "flos": 21249349551360.0, + "grad_norm": 1.6085437579817292, + "language_loss": 0.81116873, + "learning_rate": 1.9898065374977534e-07, + "loss": 0.83031559, + "num_input_tokens_seen": 154966305, + "step": 7161, + "time_per_iteration": 3.298006772994995 + }, + { + "auxiliary_loss_clip": 0.01118675, + "auxiliary_loss_mlp": 0.01018596, + "balance_loss_clip": 1.04061389, + "balance_loss_mlp": 1.01259685, + "epoch": 0.8611795827571694, + "flos": 14830102183680.0, + "grad_norm": 2.334776530974076, + "language_loss": 0.72395498, + "learning_rate": 1.9864206546609342e-07, + "loss": 0.74532765, + "num_input_tokens_seen": 154985145, + "step": 7162, + "time_per_iteration": 2.4654252529144287 + }, + { + "auxiliary_loss_clip": 0.01162847, + "auxiliary_loss_mlp": 0.01021165, + "balance_loss_clip": 1.04496765, + "balance_loss_mlp": 1.01436126, + "epoch": 0.8612998256478086, + "flos": 24243258107520.0, + "grad_norm": 1.7584485460681074, + "language_loss": 0.84370685, + "learning_rate": 1.983037504469771e-07, + "loss": 0.86554694, + "num_input_tokens_seen": 155003855, + "step": 7163, + "time_per_iteration": 2.439310312271118 + }, + { + "auxiliary_loss_clip": 0.01154268, + "auxiliary_loss_mlp": 0.01023178, + "balance_loss_clip": 1.04794371, + "balance_loss_mlp": 1.01624954, + "epoch": 0.8614200685384477, + "flos": 21252653602560.0, + "grad_norm": 1.7550456793430336, + "language_loss": 0.6641385, + "learning_rate": 1.9796570874374984e-07, + "loss": 0.68591297, + "num_input_tokens_seen": 155023960, + "step": 7164, + "time_per_iteration": 3.2776477336883545 + }, + { + "auxiliary_loss_clip": 0.01138227, + "auxiliary_loss_mlp": 0.01019308, + "balance_loss_clip": 1.04431295, + "balance_loss_mlp": 1.01208401, + "epoch": 0.8615403114290867, + "flos": 20007738080640.0, + "grad_norm": 1.9202443777637868, + "language_loss": 0.77560365, + "learning_rate": 1.976279404076917e-07, + "loss": 0.79717898, + "num_input_tokens_seen": 155043360, + "step": 7165, + "time_per_iteration": 2.489232301712036 + }, + { + "auxiliary_loss_clip": 0.01121536, + "auxiliary_loss_mlp": 0.01024967, + "balance_loss_clip": 1.04301095, + "balance_loss_mlp": 1.01811802, + "epoch": 0.8616605543197259, + "flos": 29789373674880.0, + "grad_norm": 2.3160453414399793, + "language_loss": 0.76121736, + "learning_rate": 1.9729044549004193e-07, + "loss": 0.78268236, + "num_input_tokens_seen": 155064745, + "step": 7166, + "time_per_iteration": 2.5777416229248047 + }, + { + "auxiliary_loss_clip": 0.01149603, + "auxiliary_loss_mlp": 0.01021178, + "balance_loss_clip": 1.04632354, + "balance_loss_mlp": 1.01432967, + "epoch": 0.8617807972103649, + "flos": 28911609020160.0, + "grad_norm": 2.392063385774023, + "language_loss": 0.70326626, + "learning_rate": 1.9695322404199822e-07, + "loss": 0.72497404, + "num_input_tokens_seen": 155086790, + "step": 7167, + "time_per_iteration": 2.4902288913726807 + }, + { + "auxiliary_loss_clip": 0.01139383, + "auxiliary_loss_mlp": 0.01025531, + "balance_loss_clip": 1.04743254, + "balance_loss_mlp": 1.01831341, + "epoch": 0.861901040101004, + "flos": 27673804391040.0, + "grad_norm": 2.8428791164739757, + "language_loss": 0.82057333, + "learning_rate": 1.9661627611471654e-07, + "loss": 0.84222245, + "num_input_tokens_seen": 155106585, + "step": 7168, + "time_per_iteration": 2.53017520904541 + }, + { + "auxiliary_loss_clip": 0.01143684, + "auxiliary_loss_mlp": 0.01021376, + "balance_loss_clip": 1.04487944, + "balance_loss_mlp": 1.01368058, + "epoch": 0.8620212829916432, + "flos": 49748056755840.0, + "grad_norm": 1.9479253952305635, + "language_loss": 0.7032724, + "learning_rate": 1.9627960175931246e-07, + "loss": 0.72492301, + "num_input_tokens_seen": 155131285, + "step": 7169, + "time_per_iteration": 3.5001347064971924 + }, + { + "auxiliary_loss_clip": 0.01150959, + "auxiliary_loss_mlp": 0.01027892, + "balance_loss_clip": 1.04737639, + "balance_loss_mlp": 1.02125502, + "epoch": 0.8621415258822822, + "flos": 21138672769920.0, + "grad_norm": 1.9796452057515024, + "language_loss": 0.74033833, + "learning_rate": 1.9594320102685847e-07, + "loss": 0.7621268, + "num_input_tokens_seen": 155150555, + "step": 7170, + "time_per_iteration": 2.4481654167175293 + }, + { + "auxiliary_loss_clip": 0.01125506, + "auxiliary_loss_mlp": 0.00761345, + "balance_loss_clip": 1.04096925, + "balance_loss_mlp": 1.00041819, + "epoch": 0.8622617687729213, + "flos": 21689039934720.0, + "grad_norm": 1.8623388644017276, + "language_loss": 0.63945937, + "learning_rate": 1.956070739683864e-07, + "loss": 0.65832794, + "num_input_tokens_seen": 155169890, + "step": 7171, + "time_per_iteration": 2.482297897338867 + }, + { + "auxiliary_loss_clip": 0.01107715, + "auxiliary_loss_mlp": 0.01020904, + "balance_loss_clip": 1.03871894, + "balance_loss_mlp": 1.01387656, + "epoch": 0.8623820116635604, + "flos": 26250592734720.0, + "grad_norm": 1.4883773715103235, + "language_loss": 0.74448526, + "learning_rate": 1.9527122063488678e-07, + "loss": 0.76577145, + "num_input_tokens_seen": 155191005, + "step": 7172, + "time_per_iteration": 2.5603301525115967 + }, + { + "auxiliary_loss_clip": 0.01132689, + "auxiliary_loss_mlp": 0.01019137, + "balance_loss_clip": 1.03947949, + "balance_loss_mlp": 1.01248479, + "epoch": 0.8625022545541995, + "flos": 19647554451840.0, + "grad_norm": 1.7552696607859688, + "language_loss": 0.8015672, + "learning_rate": 1.9493564107730755e-07, + "loss": 0.82308543, + "num_input_tokens_seen": 155211005, + "step": 7173, + "time_per_iteration": 2.484396457672119 + }, + { + "auxiliary_loss_clip": 0.01130122, + "auxiliary_loss_mlp": 0.01024789, + "balance_loss_clip": 1.04005027, + "balance_loss_mlp": 1.01823866, + "epoch": 0.8626224974448385, + "flos": 21908382336000.0, + "grad_norm": 1.8993551317961008, + "language_loss": 0.61120141, + "learning_rate": 1.9460033534655684e-07, + "loss": 0.63275051, + "num_input_tokens_seen": 155230365, + "step": 7174, + "time_per_iteration": 2.513669967651367 + }, + { + "auxiliary_loss_clip": 0.011275, + "auxiliary_loss_mlp": 0.01024181, + "balance_loss_clip": 1.03730571, + "balance_loss_mlp": 1.01709414, + "epoch": 0.8627427403354777, + "flos": 23331198942720.0, + "grad_norm": 2.01607008177014, + "language_loss": 0.84329581, + "learning_rate": 1.9426530349349978e-07, + "loss": 0.86481261, + "num_input_tokens_seen": 155250815, + "step": 7175, + "time_per_iteration": 2.4828202724456787 + }, + { + "auxiliary_loss_clip": 0.01148406, + "auxiliary_loss_mlp": 0.00761486, + "balance_loss_clip": 1.04410505, + "balance_loss_mlp": 1.00044036, + "epoch": 0.8628629832261168, + "flos": 16362877299840.0, + "grad_norm": 1.8052870818472062, + "language_loss": 0.64746886, + "learning_rate": 1.9393054556896038e-07, + "loss": 0.6665678, + "num_input_tokens_seen": 155268515, + "step": 7176, + "time_per_iteration": 2.498678684234619 + }, + { + "auxiliary_loss_clip": 0.01118835, + "auxiliary_loss_mlp": 0.01023025, + "balance_loss_clip": 1.03963065, + "balance_loss_mlp": 1.01558614, + "epoch": 0.8629832261167558, + "flos": 28103941756800.0, + "grad_norm": 2.2220566490366784, + "language_loss": 0.69286215, + "learning_rate": 1.9359606162372133e-07, + "loss": 0.71428072, + "num_input_tokens_seen": 155290120, + "step": 7177, + "time_per_iteration": 2.6455132961273193 + }, + { + "auxiliary_loss_clip": 0.01163659, + "auxiliary_loss_mlp": 0.01021851, + "balance_loss_clip": 1.0466615, + "balance_loss_mlp": 1.01476979, + "epoch": 0.863103469007395, + "flos": 20230061310720.0, + "grad_norm": 1.6731357986143505, + "language_loss": 0.70456815, + "learning_rate": 1.9326185170852293e-07, + "loss": 0.72642326, + "num_input_tokens_seen": 155309085, + "step": 7178, + "time_per_iteration": 2.419935941696167 + }, + { + "auxiliary_loss_clip": 0.01148719, + "auxiliary_loss_mlp": 0.01022166, + "balance_loss_clip": 1.0447228, + "balance_loss_mlp": 1.01529956, + "epoch": 0.863223711898034, + "flos": 24498547044480.0, + "grad_norm": 2.2492280640774993, + "language_loss": 0.72275078, + "learning_rate": 1.9292791587406598e-07, + "loss": 0.74445969, + "num_input_tokens_seen": 155327945, + "step": 7179, + "time_per_iteration": 2.484071969985962 + }, + { + "auxiliary_loss_clip": 0.01148161, + "auxiliary_loss_mlp": 0.00761599, + "balance_loss_clip": 1.04366326, + "balance_loss_mlp": 1.00045872, + "epoch": 0.8633439547886731, + "flos": 17675376261120.0, + "grad_norm": 1.9709774704506708, + "language_loss": 0.86926478, + "learning_rate": 1.9259425417100661e-07, + "loss": 0.88836235, + "num_input_tokens_seen": 155344060, + "step": 7180, + "time_per_iteration": 2.4398000240325928 + }, + { + "auxiliary_loss_clip": 0.01089203, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.03364253, + "balance_loss_mlp": 1.02115881, + "epoch": 0.8634641976793123, + "flos": 12895055677440.0, + "grad_norm": 2.273902149310325, + "language_loss": 0.75118923, + "learning_rate": 1.9226086664996234e-07, + "loss": 0.7723676, + "num_input_tokens_seen": 155362305, + "step": 7181, + "time_per_iteration": 2.5962767601013184 + }, + { + "auxiliary_loss_clip": 0.01140575, + "auxiliary_loss_mlp": 0.01025767, + "balance_loss_clip": 1.04629219, + "balance_loss_mlp": 1.01902294, + "epoch": 0.8635844405699513, + "flos": 23878980328320.0, + "grad_norm": 2.1145054036243787, + "language_loss": 0.74558616, + "learning_rate": 1.9192775336150712e-07, + "loss": 0.76724958, + "num_input_tokens_seen": 155382605, + "step": 7182, + "time_per_iteration": 2.522078037261963 + }, + { + "auxiliary_loss_clip": 0.01047765, + "auxiliary_loss_mlp": 0.01001411, + "balance_loss_clip": 1.00723147, + "balance_loss_mlp": 1.00045192, + "epoch": 0.8637046834605904, + "flos": 60453387521280.0, + "grad_norm": 0.7791210614873482, + "language_loss": 0.56297916, + "learning_rate": 1.915949143561739e-07, + "loss": 0.58347094, + "num_input_tokens_seen": 155437280, + "step": 7183, + "time_per_iteration": 3.0353150367736816 + }, + { + "auxiliary_loss_clip": 0.01150454, + "auxiliary_loss_mlp": 0.01025263, + "balance_loss_clip": 1.04654932, + "balance_loss_mlp": 1.01849771, + "epoch": 0.8638249263512295, + "flos": 20558751690240.0, + "grad_norm": 1.7818436930383539, + "language_loss": 0.78101188, + "learning_rate": 1.9126234968445498e-07, + "loss": 0.80276906, + "num_input_tokens_seen": 155456970, + "step": 7184, + "time_per_iteration": 2.4756267070770264 + }, + { + "auxiliary_loss_clip": 0.01166765, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.04843652, + "balance_loss_mlp": 1.02172065, + "epoch": 0.8639451692418686, + "flos": 26615768353920.0, + "grad_norm": 1.4424492467860675, + "language_loss": 0.67526162, + "learning_rate": 1.9093005939679884e-07, + "loss": 0.69721973, + "num_input_tokens_seen": 155478925, + "step": 7185, + "time_per_iteration": 2.46785044670105 + }, + { + "auxiliary_loss_clip": 0.01151267, + "auxiliary_loss_mlp": 0.01027334, + "balance_loss_clip": 1.04599619, + "balance_loss_mlp": 1.01983833, + "epoch": 0.8640654121325076, + "flos": 15122450977920.0, + "grad_norm": 1.7826463810661388, + "language_loss": 0.76235586, + "learning_rate": 1.9059804354361452e-07, + "loss": 0.78414184, + "num_input_tokens_seen": 155496700, + "step": 7186, + "time_per_iteration": 2.3923046588897705 + }, + { + "auxiliary_loss_clip": 0.0112752, + "auxiliary_loss_mlp": 0.01022135, + "balance_loss_clip": 1.03850758, + "balance_loss_mlp": 1.01474404, + "epoch": 0.8641856550231467, + "flos": 31869068250240.0, + "grad_norm": 2.7666134160528597, + "language_loss": 0.70189577, + "learning_rate": 1.902663021752684e-07, + "loss": 0.72339237, + "num_input_tokens_seen": 155518130, + "step": 7187, + "time_per_iteration": 2.5652294158935547 + }, + { + "auxiliary_loss_clip": 0.01167919, + "auxiliary_loss_mlp": 0.0102202, + "balance_loss_clip": 1.04863822, + "balance_loss_mlp": 1.01492715, + "epoch": 0.8643058979137859, + "flos": 14976545932800.0, + "grad_norm": 2.0883023640443947, + "language_loss": 0.82624209, + "learning_rate": 1.8993483534208556e-07, + "loss": 0.84814155, + "num_input_tokens_seen": 155537040, + "step": 7188, + "time_per_iteration": 3.165694236755371 + }, + { + "auxiliary_loss_clip": 0.01132931, + "auxiliary_loss_mlp": 0.01025959, + "balance_loss_clip": 1.04535341, + "balance_loss_mlp": 1.0183475, + "epoch": 0.8644261408044249, + "flos": 13115726881920.0, + "grad_norm": 2.3793782609293195, + "language_loss": 0.74976712, + "learning_rate": 1.8960364309434884e-07, + "loss": 0.77135599, + "num_input_tokens_seen": 155554535, + "step": 7189, + "time_per_iteration": 3.3004629611968994 + }, + { + "auxiliary_loss_clip": 0.01089953, + "auxiliary_loss_mlp": 0.00761578, + "balance_loss_clip": 1.03765702, + "balance_loss_mlp": 1.00045216, + "epoch": 0.864546383695064, + "flos": 20850920916480.0, + "grad_norm": 1.6633452739898058, + "language_loss": 0.78486216, + "learning_rate": 1.8927272548229967e-07, + "loss": 0.80337751, + "num_input_tokens_seen": 155574225, + "step": 7190, + "time_per_iteration": 3.425333261489868 + }, + { + "auxiliary_loss_clip": 0.0110905, + "auxiliary_loss_mlp": 0.01024414, + "balance_loss_clip": 1.04110312, + "balance_loss_mlp": 1.01752329, + "epoch": 0.8646666265857031, + "flos": 21324582587520.0, + "grad_norm": 1.5625766720170955, + "language_loss": 0.83032167, + "learning_rate": 1.8894208255613876e-07, + "loss": 0.85165632, + "num_input_tokens_seen": 155593540, + "step": 7191, + "time_per_iteration": 2.5528724193573 + }, + { + "auxiliary_loss_clip": 0.01164949, + "auxiliary_loss_mlp": 0.01021209, + "balance_loss_clip": 1.04750764, + "balance_loss_mlp": 1.01422906, + "epoch": 0.8647868694763422, + "flos": 19750833031680.0, + "grad_norm": 2.0039131677404303, + "language_loss": 0.77587777, + "learning_rate": 1.8861171436602397e-07, + "loss": 0.79773939, + "num_input_tokens_seen": 155610655, + "step": 7192, + "time_per_iteration": 2.401355028152466 + }, + { + "auxiliary_loss_clip": 0.01153267, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.0469681, + "balance_loss_mlp": 1.01959729, + "epoch": 0.8649071123669813, + "flos": 26176760328960.0, + "grad_norm": 2.2980797467456395, + "language_loss": 0.80351007, + "learning_rate": 1.882816209620719e-07, + "loss": 0.82531208, + "num_input_tokens_seen": 155627365, + "step": 7193, + "time_per_iteration": 2.4719905853271484 + }, + { + "auxiliary_loss_clip": 0.01143419, + "auxiliary_loss_mlp": 0.01026071, + "balance_loss_clip": 1.05009413, + "balance_loss_mlp": 1.0185132, + "epoch": 0.8650273552576204, + "flos": 20302888135680.0, + "grad_norm": 4.460382097870268, + "language_loss": 0.76593405, + "learning_rate": 1.8795180239435738e-07, + "loss": 0.78762901, + "num_input_tokens_seen": 155646220, + "step": 7194, + "time_per_iteration": 2.4722025394439697 + }, + { + "auxiliary_loss_clip": 0.01143013, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.045964, + "balance_loss_mlp": 1.01846695, + "epoch": 0.8651475981482595, + "flos": 23951088881280.0, + "grad_norm": 3.0105802304119966, + "language_loss": 0.7641834, + "learning_rate": 1.8762225871291348e-07, + "loss": 0.78587282, + "num_input_tokens_seen": 155662095, + "step": 7195, + "time_per_iteration": 2.491901397705078 + }, + { + "auxiliary_loss_clip": 0.01165321, + "auxiliary_loss_mlp": 0.0076158, + "balance_loss_clip": 1.0470469, + "balance_loss_mlp": 1.00047779, + "epoch": 0.8652678410388985, + "flos": 21684622561920.0, + "grad_norm": 1.664628418272885, + "language_loss": 0.80965567, + "learning_rate": 1.8729298996773201e-07, + "loss": 0.82892466, + "num_input_tokens_seen": 155680845, + "step": 7196, + "time_per_iteration": 3.188472270965576 + }, + { + "auxiliary_loss_clip": 0.01047082, + "auxiliary_loss_mlp": 0.01002488, + "balance_loss_clip": 1.00778794, + "balance_loss_mlp": 1.00148618, + "epoch": 0.8653880839295377, + "flos": 65224660855680.0, + "grad_norm": 0.8355544559823973, + "language_loss": 0.61004162, + "learning_rate": 1.8696399620876301e-07, + "loss": 0.63053727, + "num_input_tokens_seen": 155737875, + "step": 7197, + "time_per_iteration": 2.9900219440460205 + }, + { + "auxiliary_loss_clip": 0.01120454, + "auxiliary_loss_mlp": 0.01024241, + "balance_loss_clip": 1.03924894, + "balance_loss_mlp": 1.01658797, + "epoch": 0.8655083268201768, + "flos": 17749172753280.0, + "grad_norm": 2.3259976094731494, + "language_loss": 0.78793573, + "learning_rate": 1.866352774859141e-07, + "loss": 0.80938268, + "num_input_tokens_seen": 155753100, + "step": 7198, + "time_per_iteration": 2.471815824508667 + }, + { + "auxiliary_loss_clip": 0.0112523, + "auxiliary_loss_mlp": 0.01022292, + "balance_loss_clip": 1.0403744, + "balance_loss_mlp": 1.01574779, + "epoch": 0.8656285697108158, + "flos": 20703974376960.0, + "grad_norm": 2.6370103477383484, + "language_loss": 0.69603854, + "learning_rate": 1.8630683384905188e-07, + "loss": 0.71751374, + "num_input_tokens_seen": 155772430, + "step": 7199, + "time_per_iteration": 2.489495277404785 + }, + { + "auxiliary_loss_clip": 0.01167, + "auxiliary_loss_mlp": 0.00761697, + "balance_loss_clip": 1.04912877, + "balance_loss_mlp": 1.00046921, + "epoch": 0.865748812601455, + "flos": 18653833716480.0, + "grad_norm": 2.1157043708059677, + "language_loss": 0.88626057, + "learning_rate": 1.8597866534800045e-07, + "loss": 0.90554756, + "num_input_tokens_seen": 155787545, + "step": 7200, + "time_per_iteration": 2.3899731636047363 + }, + { + "auxiliary_loss_clip": 0.01153518, + "auxiliary_loss_mlp": 0.00762037, + "balance_loss_clip": 1.04650259, + "balance_loss_mlp": 1.00039446, + "epoch": 0.865869055492094, + "flos": 70652554807680.0, + "grad_norm": 2.4030041371908037, + "language_loss": 0.74556565, + "learning_rate": 1.8565077203254398e-07, + "loss": 0.76472116, + "num_input_tokens_seen": 155813005, + "step": 7201, + "time_per_iteration": 2.8548152446746826 + }, + { + "auxiliary_loss_clip": 0.01125996, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.04730594, + "balance_loss_mlp": 1.0201304, + "epoch": 0.8659892983827331, + "flos": 17383961220480.0, + "grad_norm": 2.8187071375143, + "language_loss": 0.72664392, + "learning_rate": 1.8532315395242203e-07, + "loss": 0.74817753, + "num_input_tokens_seen": 155829455, + "step": 7202, + "time_per_iteration": 2.478963851928711 + }, + { + "auxiliary_loss_clip": 0.01123371, + "auxiliary_loss_mlp": 0.01021681, + "balance_loss_clip": 1.04116428, + "balance_loss_mlp": 1.01486802, + "epoch": 0.8661095412733723, + "flos": 17895221452800.0, + "grad_norm": 2.187291378597507, + "language_loss": 0.7256676, + "learning_rate": 1.849958111573353e-07, + "loss": 0.74711818, + "num_input_tokens_seen": 155848060, + "step": 7203, + "time_per_iteration": 2.4842655658721924 + }, + { + "auxiliary_loss_clip": 0.01162012, + "auxiliary_loss_mlp": 0.01021479, + "balance_loss_clip": 1.0460043, + "balance_loss_mlp": 1.01482391, + "epoch": 0.8662297841640113, + "flos": 18224163227520.0, + "grad_norm": 1.69114508267875, + "language_loss": 0.64192033, + "learning_rate": 1.8466874369694074e-07, + "loss": 0.66375524, + "num_input_tokens_seen": 155865755, + "step": 7204, + "time_per_iteration": 2.408764362335205 + }, + { + "auxiliary_loss_clip": 0.01123255, + "auxiliary_loss_mlp": 0.01023464, + "balance_loss_clip": 1.03958607, + "balance_loss_mlp": 1.01678586, + "epoch": 0.8663500270546504, + "flos": 16362159027840.0, + "grad_norm": 2.5639042246858006, + "language_loss": 0.7051791, + "learning_rate": 1.843419516208542e-07, + "loss": 0.7266463, + "num_input_tokens_seen": 155882680, + "step": 7205, + "time_per_iteration": 2.4768755435943604 + }, + { + "auxiliary_loss_clip": 0.01153133, + "auxiliary_loss_mlp": 0.01023827, + "balance_loss_clip": 1.04743052, + "balance_loss_mlp": 1.01622105, + "epoch": 0.8664702699452895, + "flos": 17894431353600.0, + "grad_norm": 2.30049604041738, + "language_loss": 0.79530966, + "learning_rate": 1.8401543497865047e-07, + "loss": 0.81707931, + "num_input_tokens_seen": 155900680, + "step": 7206, + "time_per_iteration": 2.4018285274505615 + }, + { + "auxiliary_loss_clip": 0.0115393, + "auxiliary_loss_mlp": 0.00761717, + "balance_loss_clip": 1.0466845, + "balance_loss_mlp": 1.00041735, + "epoch": 0.8665905128359286, + "flos": 30736373794560.0, + "grad_norm": 1.8444418785065984, + "language_loss": 0.64331782, + "learning_rate": 1.836891938198608e-07, + "loss": 0.66247427, + "num_input_tokens_seen": 155921105, + "step": 7207, + "time_per_iteration": 2.519784450531006 + }, + { + "auxiliary_loss_clip": 0.01135419, + "auxiliary_loss_mlp": 0.01026907, + "balance_loss_clip": 1.04418874, + "balance_loss_mlp": 1.01985538, + "epoch": 0.8667107557265676, + "flos": 18656419495680.0, + "grad_norm": 2.112179523624969, + "language_loss": 0.71469545, + "learning_rate": 1.8336322819397677e-07, + "loss": 0.73631871, + "num_input_tokens_seen": 155938640, + "step": 7208, + "time_per_iteration": 2.4382028579711914 + }, + { + "auxiliary_loss_clip": 0.01127471, + "auxiliary_loss_mlp": 0.01024617, + "balance_loss_clip": 1.04024673, + "balance_loss_mlp": 1.01731586, + "epoch": 0.8668309986172068, + "flos": 20083725302400.0, + "grad_norm": 3.634784215260924, + "language_loss": 0.6269623, + "learning_rate": 1.8303753815044654e-07, + "loss": 0.64848322, + "num_input_tokens_seen": 155957945, + "step": 7209, + "time_per_iteration": 2.5275588035583496 + }, + { + "auxiliary_loss_clip": 0.01143566, + "auxiliary_loss_mlp": 0.01023894, + "balance_loss_clip": 1.04285336, + "balance_loss_mlp": 1.01640725, + "epoch": 0.8669512415078459, + "flos": 21615099788160.0, + "grad_norm": 21.689983554009228, + "language_loss": 0.70910311, + "learning_rate": 1.827121237386773e-07, + "loss": 0.73077774, + "num_input_tokens_seen": 155975390, + "step": 7210, + "time_per_iteration": 2.5133891105651855 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.01026775, + "balance_loss_clip": 1.04371071, + "balance_loss_mlp": 1.01966405, + "epoch": 0.8670714843984849, + "flos": 17703601372800.0, + "grad_norm": 2.5703796765308438, + "language_loss": 0.74971163, + "learning_rate": 1.8238698500803374e-07, + "loss": 0.77136779, + "num_input_tokens_seen": 155988155, + "step": 7211, + "time_per_iteration": 2.404639482498169 + }, + { + "auxiliary_loss_clip": 0.01052786, + "auxiliary_loss_mlp": 0.01001665, + "balance_loss_clip": 1.00803161, + "balance_loss_mlp": 1.00067604, + "epoch": 0.8671917272891241, + "flos": 60705483125760.0, + "grad_norm": 0.7177639008424767, + "language_loss": 0.56275952, + "learning_rate": 1.820621220078391e-07, + "loss": 0.58330399, + "num_input_tokens_seen": 156052065, + "step": 7212, + "time_per_iteration": 3.0944032669067383 + }, + { + "auxiliary_loss_clip": 0.01163049, + "auxiliary_loss_mlp": 0.01018756, + "balance_loss_clip": 1.04521549, + "balance_loss_mlp": 1.01137066, + "epoch": 0.8673119701797631, + "flos": 20451881750400.0, + "grad_norm": 1.6343438347074417, + "language_loss": 0.67810798, + "learning_rate": 1.8173753478737553e-07, + "loss": 0.69992602, + "num_input_tokens_seen": 156072500, + "step": 7213, + "time_per_iteration": 2.422515392303467 + }, + { + "auxiliary_loss_clip": 0.01168366, + "auxiliary_loss_mlp": 0.01026591, + "balance_loss_clip": 1.04770541, + "balance_loss_mlp": 1.01931643, + "epoch": 0.8674322130704022, + "flos": 19647410797440.0, + "grad_norm": 2.129531267401255, + "language_loss": 0.79784894, + "learning_rate": 1.8141322339588205e-07, + "loss": 0.81979847, + "num_input_tokens_seen": 156089840, + "step": 7214, + "time_per_iteration": 2.3884010314941406 + }, + { + "auxiliary_loss_clip": 0.01165535, + "auxiliary_loss_mlp": 0.01025992, + "balance_loss_clip": 1.04775238, + "balance_loss_mlp": 1.01889634, + "epoch": 0.8675524559610414, + "flos": 26025001367040.0, + "grad_norm": 1.9739554735825269, + "language_loss": 0.70256305, + "learning_rate": 1.810891878825569e-07, + "loss": 0.72447836, + "num_input_tokens_seen": 156109815, + "step": 7215, + "time_per_iteration": 3.151298999786377 + }, + { + "auxiliary_loss_clip": 0.01132954, + "auxiliary_loss_mlp": 0.0102297, + "balance_loss_clip": 1.04019213, + "balance_loss_mlp": 1.01597226, + "epoch": 0.8676726988516804, + "flos": 15049444584960.0, + "grad_norm": 2.3792088920886827, + "language_loss": 0.7182318, + "learning_rate": 1.8076542829655561e-07, + "loss": 0.73979104, + "num_input_tokens_seen": 156128620, + "step": 7216, + "time_per_iteration": 3.5483880043029785 + }, + { + "auxiliary_loss_clip": 0.01140362, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.0467881, + "balance_loss_mlp": 1.01904202, + "epoch": 0.8677929417423195, + "flos": 16288111140480.0, + "grad_norm": 1.8380978494496327, + "language_loss": 0.79279149, + "learning_rate": 1.8044194468699203e-07, + "loss": 0.81446517, + "num_input_tokens_seen": 156145930, + "step": 7217, + "time_per_iteration": 3.3012497425079346 + }, + { + "auxiliary_loss_clip": 0.01136375, + "auxiliary_loss_mlp": 0.0102343, + "balance_loss_clip": 1.04667044, + "balance_loss_mlp": 1.01642644, + "epoch": 0.8679131846329585, + "flos": 18844160906880.0, + "grad_norm": 2.6265625464221776, + "language_loss": 0.76021826, + "learning_rate": 1.8011873710293912e-07, + "loss": 0.7818163, + "num_input_tokens_seen": 156164435, + "step": 7218, + "time_per_iteration": 2.518362522125244 + }, + { + "auxiliary_loss_clip": 0.01147029, + "auxiliary_loss_mlp": 0.01023731, + "balance_loss_clip": 1.0453763, + "balance_loss_mlp": 1.01647723, + "epoch": 0.8680334275235977, + "flos": 33620718890880.0, + "grad_norm": 1.7668753068412386, + "language_loss": 0.69502074, + "learning_rate": 1.7979580559342677e-07, + "loss": 0.71672833, + "num_input_tokens_seen": 156185165, + "step": 7219, + "time_per_iteration": 2.539588451385498 + }, + { + "auxiliary_loss_clip": 0.0113732, + "auxiliary_loss_mlp": 0.01024839, + "balance_loss_clip": 1.04524732, + "balance_loss_mlp": 1.01791036, + "epoch": 0.8681536704142367, + "flos": 24681152810880.0, + "grad_norm": 1.661279573481478, + "language_loss": 0.66628891, + "learning_rate": 1.7947315020744358e-07, + "loss": 0.6879105, + "num_input_tokens_seen": 156206260, + "step": 7220, + "time_per_iteration": 2.5168042182922363 + }, + { + "auxiliary_loss_clip": 0.01133822, + "auxiliary_loss_mlp": 0.01020396, + "balance_loss_clip": 1.04179478, + "balance_loss_mlp": 1.01354706, + "epoch": 0.8682739133048758, + "flos": 20011042131840.0, + "grad_norm": 1.7472550506304882, + "language_loss": 0.8028605, + "learning_rate": 1.7915077099393594e-07, + "loss": 0.82440269, + "num_input_tokens_seen": 156222860, + "step": 7221, + "time_per_iteration": 2.4560141563415527 + }, + { + "auxiliary_loss_clip": 0.01153419, + "auxiliary_loss_mlp": 0.01025319, + "balance_loss_clip": 1.04447746, + "balance_loss_mlp": 1.01802301, + "epoch": 0.868394156195515, + "flos": 16654759217280.0, + "grad_norm": 1.8490439489410155, + "language_loss": 0.73337972, + "learning_rate": 1.788286680018083e-07, + "loss": 0.75516713, + "num_input_tokens_seen": 156241570, + "step": 7222, + "time_per_iteration": 2.440154552459717 + }, + { + "auxiliary_loss_clip": 0.01141224, + "auxiliary_loss_mlp": 0.01025568, + "balance_loss_clip": 1.04470885, + "balance_loss_mlp": 1.01887417, + "epoch": 0.868514399086154, + "flos": 28001381448960.0, + "grad_norm": 1.8230106415532108, + "language_loss": 0.72670174, + "learning_rate": 1.7850684127992443e-07, + "loss": 0.74836957, + "num_input_tokens_seen": 156261315, + "step": 7223, + "time_per_iteration": 3.2229413986206055 + }, + { + "auxiliary_loss_clip": 0.01123506, + "auxiliary_loss_mlp": 0.01024615, + "balance_loss_clip": 1.04491425, + "balance_loss_mlp": 1.0175817, + "epoch": 0.8686346419767931, + "flos": 20084587228800.0, + "grad_norm": 1.735764597549995, + "language_loss": 0.70108593, + "learning_rate": 1.7818529087710378e-07, + "loss": 0.72256708, + "num_input_tokens_seen": 156281670, + "step": 7224, + "time_per_iteration": 2.5162742137908936 + }, + { + "auxiliary_loss_clip": 0.01147513, + "auxiliary_loss_mlp": 0.00761875, + "balance_loss_clip": 1.04344988, + "balance_loss_mlp": 1.00043964, + "epoch": 0.8687548848674322, + "flos": 18223516782720.0, + "grad_norm": 1.6537983379543508, + "language_loss": 0.84135675, + "learning_rate": 1.7786401684212637e-07, + "loss": 0.86045063, + "num_input_tokens_seen": 156300500, + "step": 7225, + "time_per_iteration": 2.427837610244751 + }, + { + "auxiliary_loss_clip": 0.01030253, + "auxiliary_loss_mlp": 0.01003754, + "balance_loss_clip": 1.01050448, + "balance_loss_mlp": 1.00300622, + "epoch": 0.8688751277580713, + "flos": 70457885049600.0, + "grad_norm": 0.7448156500959605, + "language_loss": 0.55952418, + "learning_rate": 1.7754301922372883e-07, + "loss": 0.57986426, + "num_input_tokens_seen": 156350145, + "step": 7226, + "time_per_iteration": 2.960059881210327 + }, + { + "auxiliary_loss_clip": 0.01101348, + "auxiliary_loss_mlp": 0.01023442, + "balance_loss_clip": 1.04107022, + "balance_loss_mlp": 1.01597977, + "epoch": 0.8689953706487104, + "flos": 26906788344960.0, + "grad_norm": 1.9821852930127382, + "language_loss": 0.80675328, + "learning_rate": 1.7722229807060617e-07, + "loss": 0.82800114, + "num_input_tokens_seen": 156368725, + "step": 7227, + "time_per_iteration": 2.626340389251709 + }, + { + "auxiliary_loss_clip": 0.01113914, + "auxiliary_loss_mlp": 0.01023528, + "balance_loss_clip": 1.03940463, + "balance_loss_mlp": 1.01679015, + "epoch": 0.8691156135393495, + "flos": 34637385438720.0, + "grad_norm": 2.6854853968846255, + "language_loss": 0.82057846, + "learning_rate": 1.7690185343141172e-07, + "loss": 0.84195292, + "num_input_tokens_seen": 156388640, + "step": 7228, + "time_per_iteration": 2.6248438358306885 + }, + { + "auxiliary_loss_clip": 0.01136533, + "auxiliary_loss_mlp": 0.01020992, + "balance_loss_clip": 1.04294658, + "balance_loss_mlp": 1.01432788, + "epoch": 0.8692358564299886, + "flos": 18989814556800.0, + "grad_norm": 2.2716592416477717, + "language_loss": 0.69993812, + "learning_rate": 1.7658168535475615e-07, + "loss": 0.72151327, + "num_input_tokens_seen": 156406425, + "step": 7229, + "time_per_iteration": 2.4600648880004883 + }, + { + "auxiliary_loss_clip": 0.0114353, + "auxiliary_loss_mlp": 0.01026314, + "balance_loss_clip": 1.04729962, + "balance_loss_mlp": 1.0191381, + "epoch": 0.8693560993206276, + "flos": 30370839039360.0, + "grad_norm": 1.6102992224663388, + "language_loss": 0.64323294, + "learning_rate": 1.7626179388920948e-07, + "loss": 0.66493136, + "num_input_tokens_seen": 156427705, + "step": 7230, + "time_per_iteration": 2.569403886795044 + }, + { + "auxiliary_loss_clip": 0.01137886, + "auxiliary_loss_mlp": 0.00761504, + "balance_loss_clip": 1.04577625, + "balance_loss_mlp": 1.00051188, + "epoch": 0.8694763422112668, + "flos": 27200430028800.0, + "grad_norm": 1.6627130175782878, + "language_loss": 0.80660558, + "learning_rate": 1.7594217908329866e-07, + "loss": 0.82559949, + "num_input_tokens_seen": 156449890, + "step": 7231, + "time_per_iteration": 2.549978494644165 + }, + { + "auxiliary_loss_clip": 0.01129133, + "auxiliary_loss_mlp": 0.01020288, + "balance_loss_clip": 1.04360998, + "balance_loss_mlp": 1.01343369, + "epoch": 0.8695965851019059, + "flos": 26139161767680.0, + "grad_norm": 2.1040868894728995, + "language_loss": 0.74063492, + "learning_rate": 1.7562284098550895e-07, + "loss": 0.76212919, + "num_input_tokens_seen": 156469600, + "step": 7232, + "time_per_iteration": 2.504659652709961 + }, + { + "auxiliary_loss_clip": 0.01036728, + "auxiliary_loss_mlp": 0.01001842, + "balance_loss_clip": 1.00906968, + "balance_loss_mlp": 1.00084066, + "epoch": 0.8697168279925449, + "flos": 67332616456320.0, + "grad_norm": 0.8363226528400701, + "language_loss": 0.62288058, + "learning_rate": 1.753037796442838e-07, + "loss": 0.64326626, + "num_input_tokens_seen": 156529040, + "step": 7233, + "time_per_iteration": 3.0336718559265137 + }, + { + "auxiliary_loss_clip": 0.0116483, + "auxiliary_loss_mlp": 0.0102388, + "balance_loss_clip": 1.04627991, + "balance_loss_mlp": 1.0163815, + "epoch": 0.8698370708831841, + "flos": 19718693337600.0, + "grad_norm": 2.1924795882386596, + "language_loss": 0.75254858, + "learning_rate": 1.74984995108024e-07, + "loss": 0.77443564, + "num_input_tokens_seen": 156546970, + "step": 7234, + "time_per_iteration": 2.4014999866485596 + }, + { + "auxiliary_loss_clip": 0.01152445, + "auxiliary_loss_mlp": 0.01020582, + "balance_loss_clip": 1.04538691, + "balance_loss_mlp": 1.01382589, + "epoch": 0.8699573137738231, + "flos": 12859971068160.0, + "grad_norm": 2.0490979757867835, + "language_loss": 0.83430582, + "learning_rate": 1.7466648742508981e-07, + "loss": 0.85603607, + "num_input_tokens_seen": 156563155, + "step": 7235, + "time_per_iteration": 2.3954050540924072 + }, + { + "auxiliary_loss_clip": 0.01134735, + "auxiliary_loss_mlp": 0.01028024, + "balance_loss_clip": 1.04438365, + "balance_loss_mlp": 1.02059102, + "epoch": 0.8700775566644622, + "flos": 17420733768960.0, + "grad_norm": 1.8509984896470082, + "language_loss": 0.84744728, + "learning_rate": 1.7434825664379837e-07, + "loss": 0.86907488, + "num_input_tokens_seen": 156581660, + "step": 7236, + "time_per_iteration": 2.4437103271484375 + }, + { + "auxiliary_loss_clip": 0.01152251, + "auxiliary_loss_mlp": 0.01020869, + "balance_loss_clip": 1.04583621, + "balance_loss_mlp": 1.01381803, + "epoch": 0.8701977995551013, + "flos": 13735221770880.0, + "grad_norm": 2.472246871219972, + "language_loss": 0.86005932, + "learning_rate": 1.740303028124246e-07, + "loss": 0.88179052, + "num_input_tokens_seen": 156597720, + "step": 7237, + "time_per_iteration": 2.4325177669525146 + }, + { + "auxiliary_loss_clip": 0.01087302, + "auxiliary_loss_mlp": 0.01022647, + "balance_loss_clip": 1.03820395, + "balance_loss_mlp": 1.01557159, + "epoch": 0.8703180424457404, + "flos": 30555707362560.0, + "grad_norm": 2.1474210848133737, + "language_loss": 0.75847697, + "learning_rate": 1.7371262597920212e-07, + "loss": 0.77957654, + "num_input_tokens_seen": 156619780, + "step": 7238, + "time_per_iteration": 2.6330466270446777 + }, + { + "auxiliary_loss_clip": 0.01110035, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.04637206, + "balance_loss_mlp": 1.02401686, + "epoch": 0.8704382853363795, + "flos": 19608986223360.0, + "grad_norm": 1.4507670531626606, + "language_loss": 0.76313674, + "learning_rate": 1.7339522619232195e-07, + "loss": 0.784549, + "num_input_tokens_seen": 156638160, + "step": 7239, + "time_per_iteration": 2.5429558753967285 + }, + { + "auxiliary_loss_clip": 0.01144011, + "auxiliary_loss_mlp": 0.01025196, + "balance_loss_clip": 1.04311478, + "balance_loss_mlp": 1.01756704, + "epoch": 0.8705585282270186, + "flos": 26613900846720.0, + "grad_norm": 1.8145665293701223, + "language_loss": 0.7527864, + "learning_rate": 1.730781034999338e-07, + "loss": 0.77447844, + "num_input_tokens_seen": 156659740, + "step": 7240, + "time_per_iteration": 2.5225019454956055 + }, + { + "auxiliary_loss_clip": 0.01162477, + "auxiliary_loss_mlp": 0.01025346, + "balance_loss_clip": 1.04853749, + "balance_loss_mlp": 1.01852775, + "epoch": 0.8706787711176577, + "flos": 34090465979520.0, + "grad_norm": 2.189082264317166, + "language_loss": 0.7326014, + "learning_rate": 1.7276125795014497e-07, + "loss": 0.75447965, + "num_input_tokens_seen": 156678190, + "step": 7241, + "time_per_iteration": 3.258859872817993 + }, + { + "auxiliary_loss_clip": 0.0113959, + "auxiliary_loss_mlp": 0.01024554, + "balance_loss_clip": 1.04195809, + "balance_loss_mlp": 1.01709104, + "epoch": 0.8707990140082967, + "flos": 14611513968000.0, + "grad_norm": 1.8861881373261027, + "language_loss": 0.67529809, + "learning_rate": 1.7244468959102054e-07, + "loss": 0.69693953, + "num_input_tokens_seen": 156695245, + "step": 7242, + "time_per_iteration": 3.2692439556121826 + }, + { + "auxiliary_loss_clip": 0.01152093, + "auxiliary_loss_mlp": 0.0102106, + "balance_loss_clip": 1.04796839, + "balance_loss_mlp": 1.01383018, + "epoch": 0.8709192568989359, + "flos": 20084156265600.0, + "grad_norm": 2.162124046544784, + "language_loss": 0.85106719, + "learning_rate": 1.7212839847058348e-07, + "loss": 0.87279868, + "num_input_tokens_seen": 156710375, + "step": 7243, + "time_per_iteration": 3.285090923309326 + }, + { + "auxiliary_loss_clip": 0.01102517, + "auxiliary_loss_mlp": 0.01021966, + "balance_loss_clip": 1.03975677, + "balance_loss_mlp": 1.01538289, + "epoch": 0.871039499789575, + "flos": 16727083251840.0, + "grad_norm": 1.9652953171757017, + "language_loss": 0.73743653, + "learning_rate": 1.718123846368147e-07, + "loss": 0.7586813, + "num_input_tokens_seen": 156729420, + "step": 7244, + "time_per_iteration": 2.563595771789551 + }, + { + "auxiliary_loss_clip": 0.01137063, + "auxiliary_loss_mlp": 0.0076149, + "balance_loss_clip": 1.04633033, + "balance_loss_mlp": 1.00042486, + "epoch": 0.871159742680214, + "flos": 21068790860160.0, + "grad_norm": 1.6732765310763076, + "language_loss": 0.71722603, + "learning_rate": 1.714966481376543e-07, + "loss": 0.7362116, + "num_input_tokens_seen": 156746100, + "step": 7245, + "time_per_iteration": 2.4658870697021484 + }, + { + "auxiliary_loss_clip": 0.01149281, + "auxiliary_loss_mlp": 0.01023633, + "balance_loss_clip": 1.04388595, + "balance_loss_mlp": 1.0166893, + "epoch": 0.8712799855708532, + "flos": 28256526731520.0, + "grad_norm": 1.9632740234594193, + "language_loss": 0.83133149, + "learning_rate": 1.7118118902099797e-07, + "loss": 0.8530606, + "num_input_tokens_seen": 156764185, + "step": 7246, + "time_per_iteration": 2.474360466003418 + }, + { + "auxiliary_loss_clip": 0.01150039, + "auxiliary_loss_mlp": 0.01029283, + "balance_loss_clip": 1.04477, + "balance_loss_mlp": 1.02224934, + "epoch": 0.8714002284614922, + "flos": 22236677665920.0, + "grad_norm": 1.5796608125500102, + "language_loss": 0.8056609, + "learning_rate": 1.7086600733470146e-07, + "loss": 0.82745409, + "num_input_tokens_seen": 156784855, + "step": 7247, + "time_per_iteration": 2.445566177368164 + }, + { + "auxiliary_loss_clip": 0.01147125, + "auxiliary_loss_mlp": 0.01026997, + "balance_loss_clip": 1.04464507, + "balance_loss_mlp": 1.02038074, + "epoch": 0.8715204713521313, + "flos": 21431919404160.0, + "grad_norm": 1.935767532876687, + "language_loss": 0.77335012, + "learning_rate": 1.7055110312657738e-07, + "loss": 0.79509139, + "num_input_tokens_seen": 156804350, + "step": 7248, + "time_per_iteration": 2.4370718002319336 + }, + { + "auxiliary_loss_clip": 0.01132677, + "auxiliary_loss_mlp": 0.01027483, + "balance_loss_clip": 1.04434276, + "balance_loss_mlp": 1.01993704, + "epoch": 0.8716407142427703, + "flos": 23440439180160.0, + "grad_norm": 2.5153054770464633, + "language_loss": 0.74408627, + "learning_rate": 1.702364764443962e-07, + "loss": 0.76568788, + "num_input_tokens_seen": 156823425, + "step": 7249, + "time_per_iteration": 3.2174181938171387 + }, + { + "auxiliary_loss_clip": 0.01089067, + "auxiliary_loss_mlp": 0.01021527, + "balance_loss_clip": 1.03563094, + "balance_loss_mlp": 1.0139842, + "epoch": 0.8717609571334095, + "flos": 27958683156480.0, + "grad_norm": 1.8951449523988524, + "language_loss": 0.72447848, + "learning_rate": 1.6992212733588685e-07, + "loss": 0.74558449, + "num_input_tokens_seen": 156843090, + "step": 7250, + "time_per_iteration": 2.640659809112549 + }, + { + "auxiliary_loss_clip": 0.01133421, + "auxiliary_loss_mlp": 0.01024201, + "balance_loss_clip": 1.04252803, + "balance_loss_mlp": 1.01718819, + "epoch": 0.8718812000240486, + "flos": 25479482538240.0, + "grad_norm": 1.7649682355255958, + "language_loss": 0.75195658, + "learning_rate": 1.6960805584873538e-07, + "loss": 0.77353281, + "num_input_tokens_seen": 156861090, + "step": 7251, + "time_per_iteration": 2.500795841217041 + }, + { + "auxiliary_loss_clip": 0.01111381, + "auxiliary_loss_mlp": 0.01025281, + "balance_loss_clip": 1.04045248, + "balance_loss_mlp": 1.01842391, + "epoch": 0.8720014429146876, + "flos": 23403056100480.0, + "grad_norm": 2.812243738197062, + "language_loss": 0.78263247, + "learning_rate": 1.6929426203058684e-07, + "loss": 0.80399907, + "num_input_tokens_seen": 156881515, + "step": 7252, + "time_per_iteration": 2.5852882862091064 + }, + { + "auxiliary_loss_clip": 0.01167568, + "auxiliary_loss_mlp": 0.00762262, + "balance_loss_clip": 1.04532886, + "balance_loss_mlp": 1.00051188, + "epoch": 0.8721216858053268, + "flos": 24352821567360.0, + "grad_norm": 2.770436258071441, + "language_loss": 0.80376977, + "learning_rate": 1.689807459290431e-07, + "loss": 0.82306808, + "num_input_tokens_seen": 156900170, + "step": 7253, + "time_per_iteration": 2.434401512145996 + }, + { + "auxiliary_loss_clip": 0.01137395, + "auxiliary_loss_mlp": 0.01023688, + "balance_loss_clip": 1.04412496, + "balance_loss_mlp": 1.01732874, + "epoch": 0.8722419286959658, + "flos": 33869687034240.0, + "grad_norm": 1.9828134672737794, + "language_loss": 0.70552158, + "learning_rate": 1.6866750759166437e-07, + "loss": 0.72713244, + "num_input_tokens_seen": 156920150, + "step": 7254, + "time_per_iteration": 2.590894937515259 + }, + { + "auxiliary_loss_clip": 0.01117179, + "auxiliary_loss_mlp": 0.01021353, + "balance_loss_clip": 1.03890204, + "balance_loss_mlp": 1.0138483, + "epoch": 0.8723621715866049, + "flos": 18369385914240.0, + "grad_norm": 2.325752208452468, + "language_loss": 0.77237034, + "learning_rate": 1.6835454706596865e-07, + "loss": 0.79375571, + "num_input_tokens_seen": 156937980, + "step": 7255, + "time_per_iteration": 2.482780933380127 + }, + { + "auxiliary_loss_clip": 0.01165945, + "auxiliary_loss_mlp": 0.01026586, + "balance_loss_clip": 1.04769075, + "balance_loss_mlp": 1.01935589, + "epoch": 0.8724824144772441, + "flos": 22013348855040.0, + "grad_norm": 2.354655660311847, + "language_loss": 0.737064, + "learning_rate": 1.680418643994317e-07, + "loss": 0.75898921, + "num_input_tokens_seen": 156956550, + "step": 7256, + "time_per_iteration": 2.4169082641601562 + }, + { + "auxiliary_loss_clip": 0.01062112, + "auxiliary_loss_mlp": 0.01001034, + "balance_loss_clip": 1.00829339, + "balance_loss_mlp": 1.00016332, + "epoch": 0.8726026573678831, + "flos": 66698720213760.0, + "grad_norm": 0.8921880064192208, + "language_loss": 0.64547169, + "learning_rate": 1.6772945963948738e-07, + "loss": 0.66610312, + "num_input_tokens_seen": 157014715, + "step": 7257, + "time_per_iteration": 3.026306629180908 + }, + { + "auxiliary_loss_clip": 0.01131867, + "auxiliary_loss_mlp": 0.01023775, + "balance_loss_clip": 1.04413319, + "balance_loss_mlp": 1.01699543, + "epoch": 0.8727229002585222, + "flos": 13370908078080.0, + "grad_norm": 2.9417976147240394, + "language_loss": 0.77364326, + "learning_rate": 1.6741733283352733e-07, + "loss": 0.79519969, + "num_input_tokens_seen": 157032320, + "step": 7258, + "time_per_iteration": 2.4624507427215576 + }, + { + "auxiliary_loss_clip": 0.01116296, + "auxiliary_loss_mlp": 0.01026885, + "balance_loss_clip": 1.0433048, + "balance_loss_mlp": 1.01999807, + "epoch": 0.8728431431491613, + "flos": 21796987282560.0, + "grad_norm": 1.5145087344391084, + "language_loss": 0.8384738, + "learning_rate": 1.6710548402890102e-07, + "loss": 0.8599056, + "num_input_tokens_seen": 157052845, + "step": 7259, + "time_per_iteration": 2.553818941116333 + }, + { + "auxiliary_loss_clip": 0.01168785, + "auxiliary_loss_mlp": 0.01024988, + "balance_loss_clip": 1.04685569, + "balance_loss_mlp": 1.01740909, + "epoch": 0.8729633860398004, + "flos": 36173823742080.0, + "grad_norm": 1.8175940698179587, + "language_loss": 0.66873902, + "learning_rate": 1.6679391327291527e-07, + "loss": 0.69067675, + "num_input_tokens_seen": 157074050, + "step": 7260, + "time_per_iteration": 2.5198378562927246 + }, + { + "auxiliary_loss_clip": 0.01135614, + "auxiliary_loss_mlp": 0.01023822, + "balance_loss_clip": 1.04127288, + "balance_loss_mlp": 1.01706612, + "epoch": 0.8730836289304394, + "flos": 16359680989440.0, + "grad_norm": 4.247101649933086, + "language_loss": 0.67918015, + "learning_rate": 1.6648262061283492e-07, + "loss": 0.70077455, + "num_input_tokens_seen": 157089350, + "step": 7261, + "time_per_iteration": 2.42807936668396 + }, + { + "auxiliary_loss_clip": 0.01121677, + "auxiliary_loss_mlp": 0.01024106, + "balance_loss_clip": 1.03930736, + "balance_loss_mlp": 1.01723695, + "epoch": 0.8732038718210786, + "flos": 21215126868480.0, + "grad_norm": 1.877826568790639, + "language_loss": 0.73299396, + "learning_rate": 1.6617160609588353e-07, + "loss": 0.75445175, + "num_input_tokens_seen": 157108525, + "step": 7262, + "time_per_iteration": 2.4914398193359375 + }, + { + "auxiliary_loss_clip": 0.01142879, + "auxiliary_loss_mlp": 0.01027586, + "balance_loss_clip": 1.04569161, + "balance_loss_mlp": 1.01995111, + "epoch": 0.8733241147117177, + "flos": 16610696208000.0, + "grad_norm": 4.5376051386047065, + "language_loss": 0.71651846, + "learning_rate": 1.6586086976924163e-07, + "loss": 0.73822308, + "num_input_tokens_seen": 157124025, + "step": 7263, + "time_per_iteration": 2.4247589111328125 + }, + { + "auxiliary_loss_clip": 0.01150642, + "auxiliary_loss_mlp": 0.01021705, + "balance_loss_clip": 1.04396796, + "balance_loss_mlp": 1.01519895, + "epoch": 0.8734443576023567, + "flos": 20193935207040.0, + "grad_norm": 1.936928895674092, + "language_loss": 0.7832166, + "learning_rate": 1.6555041168004747e-07, + "loss": 0.80494004, + "num_input_tokens_seen": 157143345, + "step": 7264, + "time_per_iteration": 2.4338228702545166 + }, + { + "auxiliary_loss_clip": 0.01130214, + "auxiliary_loss_mlp": 0.01022533, + "balance_loss_clip": 1.04194391, + "balance_loss_mlp": 1.01580715, + "epoch": 0.8735646004929959, + "flos": 18041162411520.0, + "grad_norm": 1.7579067613630046, + "language_loss": 0.69390976, + "learning_rate": 1.6524023187539715e-07, + "loss": 0.71543723, + "num_input_tokens_seen": 157161630, + "step": 7265, + "time_per_iteration": 2.4513816833496094 + }, + { + "auxiliary_loss_clip": 0.01137891, + "auxiliary_loss_mlp": 0.01023288, + "balance_loss_clip": 1.04442573, + "balance_loss_mlp": 1.01650167, + "epoch": 0.873684843383635, + "flos": 20262344659200.0, + "grad_norm": 2.0068267346133606, + "language_loss": 0.74740982, + "learning_rate": 1.649303304023446e-07, + "loss": 0.76902163, + "num_input_tokens_seen": 157181385, + "step": 7266, + "time_per_iteration": 2.471242666244507 + }, + { + "auxiliary_loss_clip": 0.011177, + "auxiliary_loss_mlp": 0.01022204, + "balance_loss_clip": 1.04360771, + "balance_loss_mlp": 1.01524258, + "epoch": 0.873805086274274, + "flos": 16947287579520.0, + "grad_norm": 1.8667044926312626, + "language_loss": 0.78711295, + "learning_rate": 1.6462070730790246e-07, + "loss": 0.80851197, + "num_input_tokens_seen": 157200545, + "step": 7267, + "time_per_iteration": 2.478363275527954 + }, + { + "auxiliary_loss_clip": 0.01132664, + "auxiliary_loss_mlp": 0.01024071, + "balance_loss_clip": 1.0407896, + "balance_loss_mlp": 1.01688588, + "epoch": 0.8739253291649132, + "flos": 18041270152320.0, + "grad_norm": 2.6493317700626657, + "language_loss": 0.7863695, + "learning_rate": 1.6431136263903912e-07, + "loss": 0.80793685, + "num_input_tokens_seen": 157219545, + "step": 7268, + "time_per_iteration": 4.352779865264893 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.00761389, + "balance_loss_clip": 1.04387498, + "balance_loss_mlp": 1.00046408, + "epoch": 0.8740455720555522, + "flos": 21325085377920.0, + "grad_norm": 2.452105333380181, + "language_loss": 0.73743081, + "learning_rate": 1.6400229644268282e-07, + "loss": 0.7565819, + "num_input_tokens_seen": 157237900, + "step": 7269, + "time_per_iteration": 2.4299886226654053 + }, + { + "auxiliary_loss_clip": 0.01118299, + "auxiliary_loss_mlp": 0.01028769, + "balance_loss_clip": 1.04583395, + "balance_loss_mlp": 1.0210557, + "epoch": 0.8741658149461913, + "flos": 15158684822400.0, + "grad_norm": 1.8136251130857342, + "language_loss": 0.8119356, + "learning_rate": 1.6369350876571852e-07, + "loss": 0.83340633, + "num_input_tokens_seen": 157256055, + "step": 7270, + "time_per_iteration": 3.328937292098999 + }, + { + "auxiliary_loss_clip": 0.01105456, + "auxiliary_loss_mlp": 0.01023493, + "balance_loss_clip": 1.0391233, + "balance_loss_mlp": 1.01657021, + "epoch": 0.8742860578368304, + "flos": 23039855729280.0, + "grad_norm": 1.8416639093662597, + "language_loss": 0.81421393, + "learning_rate": 1.6338499965498874e-07, + "loss": 0.83550346, + "num_input_tokens_seen": 157274785, + "step": 7271, + "time_per_iteration": 2.5355899333953857 + }, + { + "auxiliary_loss_clip": 0.01120467, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.04358613, + "balance_loss_mlp": 1.02230835, + "epoch": 0.8744063007274695, + "flos": 28145347159680.0, + "grad_norm": 1.439658537946952, + "language_loss": 0.77447951, + "learning_rate": 1.630767691572943e-07, + "loss": 0.79598451, + "num_input_tokens_seen": 157294805, + "step": 7272, + "time_per_iteration": 2.5608465671539307 + }, + { + "auxiliary_loss_clip": 0.01042944, + "auxiliary_loss_mlp": 0.0100128, + "balance_loss_clip": 1.00839281, + "balance_loss_mlp": 1.00034416, + "epoch": 0.8745265436181086, + "flos": 64034076654720.0, + "grad_norm": 0.7358329192011749, + "language_loss": 0.53526986, + "learning_rate": 1.6276881731939306e-07, + "loss": 0.5557121, + "num_input_tokens_seen": 157356695, + "step": 7273, + "time_per_iteration": 3.1215240955352783 + }, + { + "auxiliary_loss_clip": 0.01146961, + "auxiliary_loss_mlp": 0.0102357, + "balance_loss_clip": 1.04521716, + "balance_loss_mlp": 1.01659966, + "epoch": 0.8746467865087477, + "flos": 28658618553600.0, + "grad_norm": 1.852533439932542, + "language_loss": 0.75572109, + "learning_rate": 1.6246114418800193e-07, + "loss": 0.77742648, + "num_input_tokens_seen": 157376975, + "step": 7274, + "time_per_iteration": 2.499825954437256 + }, + { + "auxiliary_loss_clip": 0.01145811, + "auxiliary_loss_mlp": 0.01026986, + "balance_loss_clip": 1.04466081, + "balance_loss_mlp": 1.01937413, + "epoch": 0.8747670293993868, + "flos": 23985850268160.0, + "grad_norm": 1.7093377120631499, + "language_loss": 0.76604038, + "learning_rate": 1.6215374980979423e-07, + "loss": 0.78776836, + "num_input_tokens_seen": 157397385, + "step": 7275, + "time_per_iteration": 2.4932079315185547 + }, + { + "auxiliary_loss_clip": 0.01145528, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.04585576, + "balance_loss_mlp": 1.02143073, + "epoch": 0.8748872722900258, + "flos": 45221624478720.0, + "grad_norm": 1.9740980187618742, + "language_loss": 0.68492776, + "learning_rate": 1.6184663423140133e-07, + "loss": 0.70666277, + "num_input_tokens_seen": 157417685, + "step": 7276, + "time_per_iteration": 3.3338396549224854 + }, + { + "auxiliary_loss_clip": 0.01112649, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.04239416, + "balance_loss_mlp": 1.02246654, + "epoch": 0.875007515180665, + "flos": 19754280737280.0, + "grad_norm": 1.7860318129655117, + "language_loss": 0.63926178, + "learning_rate": 1.615397974994126e-07, + "loss": 0.66068155, + "num_input_tokens_seen": 157435490, + "step": 7277, + "time_per_iteration": 2.5295870304107666 + }, + { + "auxiliary_loss_clip": 0.01162282, + "auxiliary_loss_mlp": 0.01022699, + "balance_loss_clip": 1.04633284, + "balance_loss_mlp": 1.01594031, + "epoch": 0.875127758071304, + "flos": 22710734386560.0, + "grad_norm": 1.4724612628082139, + "language_loss": 0.80776554, + "learning_rate": 1.6123323966037438e-07, + "loss": 0.82961535, + "num_input_tokens_seen": 157454010, + "step": 7278, + "time_per_iteration": 2.4248545169830322 + }, + { + "auxiliary_loss_clip": 0.01165152, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.04795313, + "balance_loss_mlp": 1.02484655, + "epoch": 0.8752480009619431, + "flos": 23403846199680.0, + "grad_norm": 2.220548201688487, + "language_loss": 0.78453571, + "learning_rate": 1.6092696076079216e-07, + "loss": 0.80650693, + "num_input_tokens_seen": 157472385, + "step": 7279, + "time_per_iteration": 2.405405044555664 + }, + { + "auxiliary_loss_clip": 0.01111073, + "auxiliary_loss_mlp": 0.01022329, + "balance_loss_clip": 1.04066515, + "balance_loss_mlp": 1.01545322, + "epoch": 0.8753682438525822, + "flos": 26213101914240.0, + "grad_norm": 1.5701699245583525, + "language_loss": 0.74108648, + "learning_rate": 1.6062096084712785e-07, + "loss": 0.76242048, + "num_input_tokens_seen": 157493735, + "step": 7280, + "time_per_iteration": 2.5235345363616943 + }, + { + "auxiliary_loss_clip": 0.01127196, + "auxiliary_loss_mlp": 0.00761459, + "balance_loss_clip": 1.03945374, + "balance_loss_mlp": 1.00047183, + "epoch": 0.8754884867432213, + "flos": 23326745656320.0, + "grad_norm": 2.105885957597929, + "language_loss": 0.70599437, + "learning_rate": 1.6031523996580098e-07, + "loss": 0.72488093, + "num_input_tokens_seen": 157511295, + "step": 7281, + "time_per_iteration": 2.472104549407959 + }, + { + "auxiliary_loss_clip": 0.01131526, + "auxiliary_loss_mlp": 0.01026154, + "balance_loss_clip": 1.04429686, + "balance_loss_mlp": 1.01880455, + "epoch": 0.8756087296338604, + "flos": 12495226412160.0, + "grad_norm": 2.5097686155391026, + "language_loss": 0.66421473, + "learning_rate": 1.6000979816318981e-07, + "loss": 0.68579155, + "num_input_tokens_seen": 157529760, + "step": 7282, + "time_per_iteration": 2.4813733100891113 + }, + { + "auxiliary_loss_clip": 0.01144003, + "auxiliary_loss_mlp": 0.01020992, + "balance_loss_clip": 1.04466677, + "balance_loss_mlp": 1.01388156, + "epoch": 0.8757289725244994, + "flos": 18952898353920.0, + "grad_norm": 2.1019711633317932, + "language_loss": 0.75089264, + "learning_rate": 1.5970463548562886e-07, + "loss": 0.7725426, + "num_input_tokens_seen": 157548915, + "step": 7283, + "time_per_iteration": 2.4103548526763916 + }, + { + "auxiliary_loss_clip": 0.01135534, + "auxiliary_loss_mlp": 0.01023007, + "balance_loss_clip": 1.0453856, + "balance_loss_mlp": 1.01614642, + "epoch": 0.8758492154151386, + "flos": 25265958140160.0, + "grad_norm": 1.794976923770229, + "language_loss": 0.70912701, + "learning_rate": 1.5939975197941192e-07, + "loss": 0.73071241, + "num_input_tokens_seen": 157570570, + "step": 7284, + "time_per_iteration": 2.501824378967285 + }, + { + "auxiliary_loss_clip": 0.01044147, + "auxiliary_loss_mlp": 0.01000643, + "balance_loss_clip": 1.00908566, + "balance_loss_mlp": 0.99974883, + "epoch": 0.8759694583057777, + "flos": 65571664193280.0, + "grad_norm": 0.8480058578704652, + "language_loss": 0.53354669, + "learning_rate": 1.5909514769078892e-07, + "loss": 0.55399466, + "num_input_tokens_seen": 157635675, + "step": 7285, + "time_per_iteration": 3.11342453956604 + }, + { + "auxiliary_loss_clip": 0.01115919, + "auxiliary_loss_mlp": 0.01025474, + "balance_loss_clip": 1.04525328, + "balance_loss_mlp": 1.01875901, + "epoch": 0.8760897011964167, + "flos": 25446193608960.0, + "grad_norm": 1.5350151270816048, + "language_loss": 0.77772117, + "learning_rate": 1.5879082266596867e-07, + "loss": 0.79913509, + "num_input_tokens_seen": 157657015, + "step": 7286, + "time_per_iteration": 2.5437259674072266 + }, + { + "auxiliary_loss_clip": 0.01128602, + "auxiliary_loss_mlp": 0.01022283, + "balance_loss_clip": 1.03911042, + "balance_loss_mlp": 1.01504064, + "epoch": 0.8762099440870559, + "flos": 28984830894720.0, + "grad_norm": 1.832594416387367, + "language_loss": 0.71956927, + "learning_rate": 1.5848677695111645e-07, + "loss": 0.74107808, + "num_input_tokens_seen": 157678615, + "step": 7287, + "time_per_iteration": 2.5251834392547607 + }, + { + "auxiliary_loss_clip": 0.01127496, + "auxiliary_loss_mlp": 0.01024736, + "balance_loss_clip": 1.04283404, + "balance_loss_mlp": 1.01744914, + "epoch": 0.8763301869776949, + "flos": 21609461352960.0, + "grad_norm": 2.254454354289077, + "language_loss": 0.69463563, + "learning_rate": 1.5818301059235562e-07, + "loss": 0.71615803, + "num_input_tokens_seen": 157693790, + "step": 7288, + "time_per_iteration": 2.4859535694122314 + }, + { + "auxiliary_loss_clip": 0.01137619, + "auxiliary_loss_mlp": 0.01020425, + "balance_loss_clip": 1.04543161, + "balance_loss_mlp": 1.01324582, + "epoch": 0.876450429868334, + "flos": 24644416176000.0, + "grad_norm": 1.8075034109766241, + "language_loss": 0.813995, + "learning_rate": 1.578795236357684e-07, + "loss": 0.83557546, + "num_input_tokens_seen": 157715255, + "step": 7289, + "time_per_iteration": 2.508932590484619 + }, + { + "auxiliary_loss_clip": 0.0113726, + "auxiliary_loss_mlp": 0.01022899, + "balance_loss_clip": 1.04508615, + "balance_loss_mlp": 1.01613379, + "epoch": 0.8765706727589732, + "flos": 20260046188800.0, + "grad_norm": 3.5040755570939806, + "language_loss": 0.85885584, + "learning_rate": 1.5757631612739218e-07, + "loss": 0.8804574, + "num_input_tokens_seen": 157728800, + "step": 7290, + "time_per_iteration": 2.4742484092712402 + }, + { + "auxiliary_loss_clip": 0.01061895, + "auxiliary_loss_mlp": 0.01001848, + "balance_loss_clip": 1.0079484, + "balance_loss_mlp": 1.00092435, + "epoch": 0.8766909156496122, + "flos": 71371165276800.0, + "grad_norm": 0.7815669485717385, + "language_loss": 0.61447883, + "learning_rate": 1.572733881132242e-07, + "loss": 0.63511622, + "num_input_tokens_seen": 157789445, + "step": 7291, + "time_per_iteration": 3.063943386077881 + }, + { + "auxiliary_loss_clip": 0.01028644, + "auxiliary_loss_mlp": 0.01002972, + "balance_loss_clip": 1.01076031, + "balance_loss_mlp": 1.00190496, + "epoch": 0.8768111585402513, + "flos": 69523490603520.0, + "grad_norm": 0.7819245990748084, + "language_loss": 0.58509946, + "learning_rate": 1.5697073963921814e-07, + "loss": 0.60541558, + "num_input_tokens_seen": 157848685, + "step": 7292, + "time_per_iteration": 3.016230344772339 + }, + { + "auxiliary_loss_clip": 0.01151952, + "auxiliary_loss_mlp": 0.01018753, + "balance_loss_clip": 1.04669309, + "balance_loss_mlp": 1.01141024, + "epoch": 0.8769314014308904, + "flos": 18838558385280.0, + "grad_norm": 2.264413964133642, + "language_loss": 0.84785247, + "learning_rate": 1.566683707512857e-07, + "loss": 0.86955953, + "num_input_tokens_seen": 157866360, + "step": 7293, + "time_per_iteration": 2.421010971069336 + }, + { + "auxiliary_loss_clip": 0.0113355, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.04351854, + "balance_loss_mlp": 1.02105916, + "epoch": 0.8770516443215295, + "flos": 14976402278400.0, + "grad_norm": 1.9243735443994014, + "language_loss": 0.79577112, + "learning_rate": 1.5636628149529553e-07, + "loss": 0.81739438, + "num_input_tokens_seen": 157884150, + "step": 7294, + "time_per_iteration": 3.179569959640503 + }, + { + "auxiliary_loss_clip": 0.01133557, + "auxiliary_loss_mlp": 0.01022716, + "balance_loss_clip": 1.04266167, + "balance_loss_mlp": 1.01615024, + "epoch": 0.8771718872121685, + "flos": 31649654021760.0, + "grad_norm": 2.573076373108681, + "language_loss": 0.79259431, + "learning_rate": 1.560644719170743e-07, + "loss": 0.81415707, + "num_input_tokens_seen": 157905020, + "step": 7295, + "time_per_iteration": 3.405374765396118 + }, + { + "auxiliary_loss_clip": 0.01120247, + "auxiliary_loss_mlp": 0.01025591, + "balance_loss_clip": 1.04078662, + "balance_loss_mlp": 1.01785469, + "epoch": 0.8772921301028077, + "flos": 36095466222720.0, + "grad_norm": 1.7139196897832787, + "language_loss": 0.72097123, + "learning_rate": 1.5576294206240692e-07, + "loss": 0.74242961, + "num_input_tokens_seen": 157924545, + "step": 7296, + "time_per_iteration": 2.63200306892395 + }, + { + "auxiliary_loss_clip": 0.01134486, + "auxiliary_loss_mlp": 0.01024436, + "balance_loss_clip": 1.04464006, + "balance_loss_mlp": 1.01770353, + "epoch": 0.8774123729934468, + "flos": 57116961849600.0, + "grad_norm": 1.599752716293309, + "language_loss": 0.67424309, + "learning_rate": 1.5546169197703507e-07, + "loss": 0.69583237, + "num_input_tokens_seen": 157950820, + "step": 7297, + "time_per_iteration": 3.6132898330688477 + }, + { + "auxiliary_loss_clip": 0.01141044, + "auxiliary_loss_mlp": 0.01026423, + "balance_loss_clip": 1.04180646, + "balance_loss_mlp": 1.0195744, + "epoch": 0.8775326158840858, + "flos": 23914495900800.0, + "grad_norm": 2.819588830314426, + "language_loss": 0.77361566, + "learning_rate": 1.5516072170665774e-07, + "loss": 0.79529035, + "num_input_tokens_seen": 157968790, + "step": 7298, + "time_per_iteration": 2.4898064136505127 + }, + { + "auxiliary_loss_clip": 0.01151527, + "auxiliary_loss_mlp": 0.01020035, + "balance_loss_clip": 1.04496181, + "balance_loss_mlp": 1.01342487, + "epoch": 0.877652858774725, + "flos": 17123285243520.0, + "grad_norm": 1.9497699785209313, + "language_loss": 0.86900139, + "learning_rate": 1.5486003129693214e-07, + "loss": 0.89071703, + "num_input_tokens_seen": 157986155, + "step": 7299, + "time_per_iteration": 2.4063479900360107 + }, + { + "auxiliary_loss_clip": 0.01151676, + "auxiliary_loss_mlp": 0.01021452, + "balance_loss_clip": 1.04569602, + "balance_loss_mlp": 1.01438928, + "epoch": 0.877773101665364, + "flos": 16508961912960.0, + "grad_norm": 1.9450436910258881, + "language_loss": 0.78161234, + "learning_rate": 1.545596207934725e-07, + "loss": 0.80334359, + "num_input_tokens_seen": 158004640, + "step": 7300, + "time_per_iteration": 2.410071611404419 + }, + { + "auxiliary_loss_clip": 0.01128689, + "auxiliary_loss_mlp": 0.0102387, + "balance_loss_clip": 1.04172814, + "balance_loss_mlp": 1.01659501, + "epoch": 0.8778933445560031, + "flos": 22053209973120.0, + "grad_norm": 1.7879151151134898, + "language_loss": 0.77681673, + "learning_rate": 1.5425949024185147e-07, + "loss": 0.79834229, + "num_input_tokens_seen": 158024665, + "step": 7301, + "time_per_iteration": 2.486607313156128 + }, + { + "auxiliary_loss_clip": 0.01138102, + "auxiliary_loss_mlp": 0.01025856, + "balance_loss_clip": 1.04291797, + "balance_loss_mlp": 1.01881361, + "epoch": 0.8780135874466423, + "flos": 22564757514240.0, + "grad_norm": 2.075972349259541, + "language_loss": 0.67669159, + "learning_rate": 1.5395963968759818e-07, + "loss": 0.69833124, + "num_input_tokens_seen": 158044940, + "step": 7302, + "time_per_iteration": 3.2607688903808594 + }, + { + "auxiliary_loss_clip": 0.01137313, + "auxiliary_loss_mlp": 0.01020815, + "balance_loss_clip": 1.04224372, + "balance_loss_mlp": 1.01370716, + "epoch": 0.8781338303372813, + "flos": 61531999073280.0, + "grad_norm": 1.5629112086878276, + "language_loss": 0.64401269, + "learning_rate": 1.536600691761998e-07, + "loss": 0.66559398, + "num_input_tokens_seen": 158070770, + "step": 7303, + "time_per_iteration": 2.8303260803222656 + }, + { + "auxiliary_loss_clip": 0.01127785, + "auxiliary_loss_mlp": 0.0102383, + "balance_loss_clip": 1.04722524, + "balance_loss_mlp": 1.01747036, + "epoch": 0.8782540732279204, + "flos": 22674751937280.0, + "grad_norm": 2.908706747026936, + "language_loss": 0.71378052, + "learning_rate": 1.5336077875310084e-07, + "loss": 0.73529673, + "num_input_tokens_seen": 158089995, + "step": 7304, + "time_per_iteration": 2.513988971710205 + }, + { + "auxiliary_loss_clip": 0.01112788, + "auxiliary_loss_mlp": 0.01021601, + "balance_loss_clip": 1.04157197, + "balance_loss_mlp": 1.01475227, + "epoch": 0.8783743161185595, + "flos": 16070348937600.0, + "grad_norm": 1.8643091032824237, + "language_loss": 0.74091262, + "learning_rate": 1.5306176846370321e-07, + "loss": 0.7622565, + "num_input_tokens_seen": 158108140, + "step": 7305, + "time_per_iteration": 2.500054121017456 + }, + { + "auxiliary_loss_clip": 0.01144422, + "auxiliary_loss_mlp": 0.01032515, + "balance_loss_clip": 1.04369247, + "balance_loss_mlp": 1.0251832, + "epoch": 0.8784945590091986, + "flos": 26067879227520.0, + "grad_norm": 2.006700364624822, + "language_loss": 0.74064058, + "learning_rate": 1.5276303835336712e-07, + "loss": 0.76240993, + "num_input_tokens_seen": 158128680, + "step": 7306, + "time_per_iteration": 2.5232291221618652 + }, + { + "auxiliary_loss_clip": 0.01053913, + "auxiliary_loss_mlp": 0.01000644, + "balance_loss_clip": 1.00853157, + "balance_loss_mlp": 0.99977398, + "epoch": 0.8786148018998376, + "flos": 62720643939840.0, + "grad_norm": 0.7645305825577966, + "language_loss": 0.53551519, + "learning_rate": 1.524645884674094e-07, + "loss": 0.55606073, + "num_input_tokens_seen": 158185610, + "step": 7307, + "time_per_iteration": 3.014658212661743 + }, + { + "auxiliary_loss_clip": 0.01164924, + "auxiliary_loss_mlp": 0.00762419, + "balance_loss_clip": 1.04495406, + "balance_loss_mlp": 1.00043738, + "epoch": 0.8787350447904768, + "flos": 21652734263040.0, + "grad_norm": 3.863340952209873, + "language_loss": 0.79001743, + "learning_rate": 1.521664188511047e-07, + "loss": 0.80929089, + "num_input_tokens_seen": 158205635, + "step": 7308, + "time_per_iteration": 2.4349019527435303 + }, + { + "auxiliary_loss_clip": 0.01139687, + "auxiliary_loss_mlp": 0.00761752, + "balance_loss_clip": 1.04875147, + "balance_loss_mlp": 1.00054598, + "epoch": 0.8788552876811159, + "flos": 25478476957440.0, + "grad_norm": 1.8669475001719467, + "language_loss": 0.80641675, + "learning_rate": 1.518685295496851e-07, + "loss": 0.82543111, + "num_input_tokens_seen": 158223495, + "step": 7309, + "time_per_iteration": 2.5166094303131104 + }, + { + "auxiliary_loss_clip": 0.0115075, + "auxiliary_loss_mlp": 0.01022878, + "balance_loss_clip": 1.04389, + "balance_loss_mlp": 1.01619339, + "epoch": 0.8789755305717549, + "flos": 22310222762880.0, + "grad_norm": 1.5892033441673044, + "language_loss": 0.85107481, + "learning_rate": 1.5157092060833975e-07, + "loss": 0.87281114, + "num_input_tokens_seen": 158243145, + "step": 7310, + "time_per_iteration": 2.471359968185425 + }, + { + "auxiliary_loss_clip": 0.01134144, + "auxiliary_loss_mlp": 0.01018158, + "balance_loss_clip": 1.04197431, + "balance_loss_mlp": 1.01142299, + "epoch": 0.879095773462394, + "flos": 29310971408640.0, + "grad_norm": 1.524385952489216, + "language_loss": 0.65837979, + "learning_rate": 1.5127359207221658e-07, + "loss": 0.67990279, + "num_input_tokens_seen": 158262625, + "step": 7311, + "time_per_iteration": 2.5396535396575928 + }, + { + "auxiliary_loss_clip": 0.01083491, + "auxiliary_loss_mlp": 0.0102234, + "balance_loss_clip": 1.03510821, + "balance_loss_mlp": 1.01461518, + "epoch": 0.8792160163530331, + "flos": 16690023394560.0, + "grad_norm": 1.8527610637499978, + "language_loss": 0.73353505, + "learning_rate": 1.5097654398641923e-07, + "loss": 0.75459343, + "num_input_tokens_seen": 158280530, + "step": 7312, + "time_per_iteration": 2.576570987701416 + }, + { + "auxiliary_loss_clip": 0.01155488, + "auxiliary_loss_mlp": 0.01025056, + "balance_loss_clip": 1.04616594, + "balance_loss_mlp": 1.01764083, + "epoch": 0.8793362592436722, + "flos": 24499301230080.0, + "grad_norm": 1.5507347272198129, + "language_loss": 0.73067212, + "learning_rate": 1.5067977639601014e-07, + "loss": 0.75247753, + "num_input_tokens_seen": 158303290, + "step": 7313, + "time_per_iteration": 2.491948366165161 + }, + { + "auxiliary_loss_clip": 0.0113365, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_clip": 1.04443944, + "balance_loss_mlp": 1.01734138, + "epoch": 0.8794565021343113, + "flos": 14538399834240.0, + "grad_norm": 3.035761442661135, + "language_loss": 0.71191573, + "learning_rate": 1.5038328934600864e-07, + "loss": 0.73349553, + "num_input_tokens_seen": 158319925, + "step": 7314, + "time_per_iteration": 2.4762117862701416 + }, + { + "auxiliary_loss_clip": 0.01134273, + "auxiliary_loss_mlp": 0.01025895, + "balance_loss_clip": 1.04428303, + "balance_loss_mlp": 1.01915979, + "epoch": 0.8795767450249504, + "flos": 39530286224640.0, + "grad_norm": 2.275108579227439, + "language_loss": 0.70001376, + "learning_rate": 1.5008708288139161e-07, + "loss": 0.72161543, + "num_input_tokens_seen": 158342285, + "step": 7315, + "time_per_iteration": 2.642104387283325 + }, + { + "auxiliary_loss_clip": 0.01151542, + "auxiliary_loss_mlp": 0.01025986, + "balance_loss_clip": 1.04667854, + "balance_loss_mlp": 1.01880705, + "epoch": 0.8796969879155895, + "flos": 22960672197120.0, + "grad_norm": 1.907857171125672, + "language_loss": 0.73190546, + "learning_rate": 1.497911570470931e-07, + "loss": 0.75368077, + "num_input_tokens_seen": 158362290, + "step": 7316, + "time_per_iteration": 2.471937894821167 + }, + { + "auxiliary_loss_clip": 0.01112257, + "auxiliary_loss_mlp": 0.01028597, + "balance_loss_clip": 1.04162276, + "balance_loss_mlp": 1.02133727, + "epoch": 0.8798172308062285, + "flos": 28362427004160.0, + "grad_norm": 1.7013615199981986, + "language_loss": 0.85545564, + "learning_rate": 1.494955118880048e-07, + "loss": 0.87686414, + "num_input_tokens_seen": 158383275, + "step": 7317, + "time_per_iteration": 2.5632169246673584 + }, + { + "auxiliary_loss_clip": 0.01150999, + "auxiliary_loss_mlp": 0.01025349, + "balance_loss_clip": 1.04445624, + "balance_loss_mlp": 1.01834583, + "epoch": 0.8799374736968677, + "flos": 23988974751360.0, + "grad_norm": 1.6072975066153066, + "language_loss": 0.72797132, + "learning_rate": 1.4920014744897634e-07, + "loss": 0.74973476, + "num_input_tokens_seen": 158402690, + "step": 7318, + "time_per_iteration": 2.5129966735839844 + }, + { + "auxiliary_loss_clip": 0.01127804, + "auxiliary_loss_mlp": 0.01018451, + "balance_loss_clip": 1.04282677, + "balance_loss_mlp": 1.01139379, + "epoch": 0.8800577165875068, + "flos": 25630271832960.0, + "grad_norm": 1.9919640693505132, + "language_loss": 0.86248398, + "learning_rate": 1.4890506377481392e-07, + "loss": 0.88394654, + "num_input_tokens_seen": 158421780, + "step": 7319, + "time_per_iteration": 2.5378336906433105 + }, + { + "auxiliary_loss_clip": 0.01094993, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.04215813, + "balance_loss_mlp": 1.02373707, + "epoch": 0.8801779594781458, + "flos": 23440331439360.0, + "grad_norm": 1.598586831054208, + "language_loss": 0.6399948, + "learning_rate": 1.486102609102815e-07, + "loss": 0.66125, + "num_input_tokens_seen": 158442330, + "step": 7320, + "time_per_iteration": 2.613802433013916 + }, + { + "auxiliary_loss_clip": 0.01129238, + "auxiliary_loss_mlp": 0.01023758, + "balance_loss_clip": 1.04291725, + "balance_loss_mlp": 1.01688814, + "epoch": 0.880298202368785, + "flos": 11508580656000.0, + "grad_norm": 2.4365163959647145, + "language_loss": 0.8557207, + "learning_rate": 1.483157389001004e-07, + "loss": 0.87725067, + "num_input_tokens_seen": 158459890, + "step": 7321, + "time_per_iteration": 3.2902450561523438 + }, + { + "auxiliary_loss_clip": 0.01136308, + "auxiliary_loss_mlp": 0.01022919, + "balance_loss_clip": 1.04140711, + "balance_loss_mlp": 1.01521778, + "epoch": 0.880418445259424, + "flos": 22671447886080.0, + "grad_norm": 2.066468582129408, + "language_loss": 0.78696632, + "learning_rate": 1.4802149778894933e-07, + "loss": 0.80855858, + "num_input_tokens_seen": 158478680, + "step": 7322, + "time_per_iteration": 2.4976484775543213 + }, + { + "auxiliary_loss_clip": 0.01140318, + "auxiliary_loss_mlp": 0.0102217, + "balance_loss_clip": 1.04095674, + "balance_loss_mlp": 1.01589322, + "epoch": 0.8805386881500631, + "flos": 20522158709760.0, + "grad_norm": 1.7618588769863543, + "language_loss": 0.87580609, + "learning_rate": 1.4772753762146484e-07, + "loss": 0.89743096, + "num_input_tokens_seen": 158497935, + "step": 7323, + "time_per_iteration": 2.436676502227783 + }, + { + "auxiliary_loss_clip": 0.01145624, + "auxiliary_loss_mlp": 0.01019832, + "balance_loss_clip": 1.0443821, + "balance_loss_mlp": 1.01225281, + "epoch": 0.8806589310407023, + "flos": 36538891620480.0, + "grad_norm": 1.591509823588181, + "language_loss": 0.70549273, + "learning_rate": 1.474338584422401e-07, + "loss": 0.72714734, + "num_input_tokens_seen": 158523145, + "step": 7324, + "time_per_iteration": 3.442779779434204 + }, + { + "auxiliary_loss_clip": 0.01145237, + "auxiliary_loss_mlp": 0.01020684, + "balance_loss_clip": 1.04453063, + "balance_loss_mlp": 1.01381409, + "epoch": 0.8807791739313413, + "flos": 23440187784960.0, + "grad_norm": 1.6276190756949633, + "language_loss": 0.75873756, + "learning_rate": 1.4714046029582595e-07, + "loss": 0.78039676, + "num_input_tokens_seen": 158542210, + "step": 7325, + "time_per_iteration": 2.455404758453369 + }, + { + "auxiliary_loss_clip": 0.01124952, + "auxiliary_loss_mlp": 0.01022688, + "balance_loss_clip": 1.0425694, + "balance_loss_mlp": 1.01545227, + "epoch": 0.8808994168219804, + "flos": 25956843310080.0, + "grad_norm": 1.818360019062367, + "language_loss": 0.75533503, + "learning_rate": 1.46847343226731e-07, + "loss": 0.77681148, + "num_input_tokens_seen": 158563250, + "step": 7326, + "time_per_iteration": 2.552887439727783 + }, + { + "auxiliary_loss_clip": 0.01151537, + "auxiliary_loss_mlp": 0.01023485, + "balance_loss_clip": 1.04391599, + "balance_loss_mlp": 1.01612389, + "epoch": 0.8810196597126195, + "flos": 17092079303040.0, + "grad_norm": 2.0285178791236946, + "language_loss": 0.69577301, + "learning_rate": 1.465545072794203e-07, + "loss": 0.71752322, + "num_input_tokens_seen": 158581125, + "step": 7327, + "time_per_iteration": 2.424048900604248 + }, + { + "auxiliary_loss_clip": 0.01107508, + "auxiliary_loss_mlp": 0.01023891, + "balance_loss_clip": 1.04375744, + "balance_loss_mlp": 1.01673532, + "epoch": 0.8811399026032586, + "flos": 23002831785600.0, + "grad_norm": 2.5769298564056418, + "language_loss": 0.75683004, + "learning_rate": 1.4626195249831774e-07, + "loss": 0.778144, + "num_input_tokens_seen": 158602025, + "step": 7328, + "time_per_iteration": 2.5792641639709473 + }, + { + "auxiliary_loss_clip": 0.01148546, + "auxiliary_loss_mlp": 0.01021431, + "balance_loss_clip": 1.04422379, + "balance_loss_mlp": 1.0146066, + "epoch": 0.8812601454938976, + "flos": 14463813242880.0, + "grad_norm": 1.9528449440150601, + "language_loss": 0.71760786, + "learning_rate": 1.4596967892780244e-07, + "loss": 0.73930764, + "num_input_tokens_seen": 158618355, + "step": 7329, + "time_per_iteration": 3.194915771484375 + }, + { + "auxiliary_loss_clip": 0.01162686, + "auxiliary_loss_mlp": 0.01023766, + "balance_loss_clip": 1.04621303, + "balance_loss_mlp": 1.01692367, + "epoch": 0.8813803883845368, + "flos": 22493223578880.0, + "grad_norm": 1.6494222605649067, + "language_loss": 0.74884796, + "learning_rate": 1.4567768661221314e-07, + "loss": 0.77071249, + "num_input_tokens_seen": 158638925, + "step": 7330, + "time_per_iteration": 2.4850776195526123 + }, + { + "auxiliary_loss_clip": 0.01155302, + "auxiliary_loss_mlp": 0.00761926, + "balance_loss_clip": 1.04727781, + "balance_loss_mlp": 1.00046337, + "epoch": 0.8815006312751759, + "flos": 21506901045120.0, + "grad_norm": 1.9400371862733354, + "language_loss": 0.74573803, + "learning_rate": 1.4538597559584442e-07, + "loss": 0.76491034, + "num_input_tokens_seen": 158656715, + "step": 7331, + "time_per_iteration": 2.478764295578003 + }, + { + "auxiliary_loss_clip": 0.01133124, + "auxiliary_loss_mlp": 0.01025317, + "balance_loss_clip": 1.04287696, + "balance_loss_mlp": 1.01793778, + "epoch": 0.8816208741658149, + "flos": 22784566792320.0, + "grad_norm": 2.3234348253468395, + "language_loss": 0.79236937, + "learning_rate": 1.4509454592294823e-07, + "loss": 0.81395376, + "num_input_tokens_seen": 158677200, + "step": 7332, + "time_per_iteration": 2.5042765140533447 + }, + { + "auxiliary_loss_clip": 0.01126054, + "auxiliary_loss_mlp": 0.00761949, + "balance_loss_clip": 1.04570723, + "balance_loss_mlp": 1.00043428, + "epoch": 0.8817411170564541, + "flos": 17779409026560.0, + "grad_norm": 1.9214944616600578, + "language_loss": 0.79064822, + "learning_rate": 1.448033976377354e-07, + "loss": 0.80952829, + "num_input_tokens_seen": 158692185, + "step": 7333, + "time_per_iteration": 2.497204303741455 + }, + { + "auxiliary_loss_clip": 0.01151412, + "auxiliary_loss_mlp": 0.01020055, + "balance_loss_clip": 1.0434835, + "balance_loss_mlp": 1.01328373, + "epoch": 0.8818613599470931, + "flos": 18551812112640.0, + "grad_norm": 1.8108215662815614, + "language_loss": 0.7412011, + "learning_rate": 1.445125307843713e-07, + "loss": 0.76291573, + "num_input_tokens_seen": 158710410, + "step": 7334, + "time_per_iteration": 2.422093629837036 + }, + { + "auxiliary_loss_clip": 0.01149064, + "auxiliary_loss_mlp": 0.01021286, + "balance_loss_clip": 1.04654717, + "balance_loss_mlp": 1.01507831, + "epoch": 0.8819816028377322, + "flos": 27599792417280.0, + "grad_norm": 1.5808760505769883, + "language_loss": 0.75867236, + "learning_rate": 1.442219454069813e-07, + "loss": 0.78037584, + "num_input_tokens_seen": 158731435, + "step": 7335, + "time_per_iteration": 2.5133044719696045 + }, + { + "auxiliary_loss_clip": 0.01110849, + "auxiliary_loss_mlp": 0.01026464, + "balance_loss_clip": 1.0421983, + "balance_loss_mlp": 1.0197438, + "epoch": 0.8821018457283714, + "flos": 23404600385280.0, + "grad_norm": 2.1464745733324277, + "language_loss": 0.66530502, + "learning_rate": 1.4393164154964676e-07, + "loss": 0.68667817, + "num_input_tokens_seen": 158750965, + "step": 7336, + "time_per_iteration": 2.555781364440918 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01025165, + "balance_loss_clip": 1.04768014, + "balance_loss_mlp": 1.01837564, + "epoch": 0.8822220886190104, + "flos": 29132459792640.0, + "grad_norm": 2.126371595911774, + "language_loss": 0.94086659, + "learning_rate": 1.4364161925640649e-07, + "loss": 0.96262175, + "num_input_tokens_seen": 158772365, + "step": 7337, + "time_per_iteration": 2.5190610885620117 + }, + { + "auxiliary_loss_clip": 0.01163945, + "auxiliary_loss_mlp": 0.01022252, + "balance_loss_clip": 1.04698277, + "balance_loss_mlp": 1.01548684, + "epoch": 0.8823423315096495, + "flos": 20485422074880.0, + "grad_norm": 1.8771424008116642, + "language_loss": 0.85013425, + "learning_rate": 1.4335187857125663e-07, + "loss": 0.87199628, + "num_input_tokens_seen": 158791065, + "step": 7338, + "time_per_iteration": 2.4469211101531982 + }, + { + "auxiliary_loss_clip": 0.01151542, + "auxiliary_loss_mlp": 0.01020202, + "balance_loss_clip": 1.04482007, + "balance_loss_mlp": 1.01351452, + "epoch": 0.8824625744002886, + "flos": 24206377818240.0, + "grad_norm": 1.707140800204605, + "language_loss": 0.75492656, + "learning_rate": 1.4306241953815023e-07, + "loss": 0.77664405, + "num_input_tokens_seen": 158812125, + "step": 7339, + "time_per_iteration": 2.481071949005127 + }, + { + "auxiliary_loss_clip": 0.01152356, + "auxiliary_loss_mlp": 0.01021534, + "balance_loss_clip": 1.04634786, + "balance_loss_mlp": 1.01454782, + "epoch": 0.8825828172909277, + "flos": 24679500785280.0, + "grad_norm": 1.8240121142624837, + "language_loss": 0.70930111, + "learning_rate": 1.4277324220099862e-07, + "loss": 0.73104006, + "num_input_tokens_seen": 158834035, + "step": 7340, + "time_per_iteration": 2.4850611686706543 + }, + { + "auxiliary_loss_clip": 0.01116506, + "auxiliary_loss_mlp": 0.010231, + "balance_loss_clip": 1.04049945, + "balance_loss_mlp": 1.01630521, + "epoch": 0.8827030601815667, + "flos": 22456163721600.0, + "grad_norm": 1.9610634645826541, + "language_loss": 0.7435261, + "learning_rate": 1.4248434660366938e-07, + "loss": 0.76492214, + "num_input_tokens_seen": 158853510, + "step": 7341, + "time_per_iteration": 2.502406358718872 + }, + { + "auxiliary_loss_clip": 0.01134861, + "auxiliary_loss_mlp": 0.01025517, + "balance_loss_clip": 1.04488158, + "balance_loss_mlp": 1.01854336, + "epoch": 0.8828233030722058, + "flos": 19865639877120.0, + "grad_norm": 2.2529110446295153, + "language_loss": 0.70449144, + "learning_rate": 1.4219573278998808e-07, + "loss": 0.7260952, + "num_input_tokens_seen": 158871970, + "step": 7342, + "time_per_iteration": 2.465952157974243 + }, + { + "auxiliary_loss_clip": 0.01134606, + "auxiliary_loss_mlp": 0.01023129, + "balance_loss_clip": 1.04135799, + "balance_loss_mlp": 1.01561308, + "epoch": 0.882943545962845, + "flos": 39347213581440.0, + "grad_norm": 2.4144026349215286, + "language_loss": 0.65100288, + "learning_rate": 1.4190740080373685e-07, + "loss": 0.67258024, + "num_input_tokens_seen": 158892250, + "step": 7343, + "time_per_iteration": 2.651212453842163 + }, + { + "auxiliary_loss_clip": 0.01108483, + "auxiliary_loss_mlp": 0.01024688, + "balance_loss_clip": 1.04243755, + "balance_loss_mlp": 1.01709437, + "epoch": 0.883063788853484, + "flos": 19054524908160.0, + "grad_norm": 1.9286691086263699, + "language_loss": 0.84236878, + "learning_rate": 1.4161935068865538e-07, + "loss": 0.86370045, + "num_input_tokens_seen": 158907395, + "step": 7344, + "time_per_iteration": 2.551642656326294 + }, + { + "auxiliary_loss_clip": 0.01164032, + "auxiliary_loss_mlp": 0.01021445, + "balance_loss_clip": 1.0461843, + "balance_loss_mlp": 1.01421499, + "epoch": 0.8831840317441231, + "flos": 18733196816640.0, + "grad_norm": 2.1069957650828313, + "language_loss": 0.75574142, + "learning_rate": 1.4133158248844113e-07, + "loss": 0.77759618, + "num_input_tokens_seen": 158926300, + "step": 7345, + "time_per_iteration": 2.3864638805389404 + }, + { + "auxiliary_loss_clip": 0.01126088, + "auxiliary_loss_mlp": 0.01023549, + "balance_loss_clip": 1.04333997, + "balance_loss_mlp": 1.01577067, + "epoch": 0.8833042746347622, + "flos": 26827712553600.0, + "grad_norm": 2.282310599232799, + "language_loss": 0.73526591, + "learning_rate": 1.4104409624674785e-07, + "loss": 0.75676233, + "num_input_tokens_seen": 158946085, + "step": 7346, + "time_per_iteration": 2.5429930686950684 + }, + { + "auxiliary_loss_clip": 0.01152918, + "auxiliary_loss_mlp": 0.01019155, + "balance_loss_clip": 1.04880488, + "balance_loss_mlp": 1.01244402, + "epoch": 0.8834245175254013, + "flos": 26104077158400.0, + "grad_norm": 1.8701023507103074, + "language_loss": 0.78414667, + "learning_rate": 1.407568920071873e-07, + "loss": 0.80586743, + "num_input_tokens_seen": 158964950, + "step": 7347, + "time_per_iteration": 2.4870550632476807 + }, + { + "auxiliary_loss_clip": 0.01169903, + "auxiliary_loss_mlp": 0.0102551, + "balance_loss_clip": 1.04780555, + "balance_loss_mlp": 1.01800549, + "epoch": 0.8835447604160404, + "flos": 30629036977920.0, + "grad_norm": 1.8127838562903202, + "language_loss": 0.68040216, + "learning_rate": 1.4046996981332782e-07, + "loss": 0.70235622, + "num_input_tokens_seen": 158984835, + "step": 7348, + "time_per_iteration": 3.5452449321746826 + }, + { + "auxiliary_loss_clip": 0.01125377, + "auxiliary_loss_mlp": 0.01022474, + "balance_loss_clip": 1.04293323, + "balance_loss_mlp": 1.01482034, + "epoch": 0.8836650033066795, + "flos": 24718356322560.0, + "grad_norm": 1.8635928875697585, + "language_loss": 0.77988631, + "learning_rate": 1.4018332970869516e-07, + "loss": 0.80136484, + "num_input_tokens_seen": 159002775, + "step": 7349, + "time_per_iteration": 2.5430126190185547 + }, + { + "auxiliary_loss_clip": 0.01132065, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.04517722, + "balance_loss_mlp": 1.01832163, + "epoch": 0.8837852461973186, + "flos": 25413371556480.0, + "grad_norm": 1.8206539258283447, + "language_loss": 0.85138249, + "learning_rate": 1.398969717367733e-07, + "loss": 0.87296033, + "num_input_tokens_seen": 159024100, + "step": 7350, + "time_per_iteration": 3.366276264190674 + }, + { + "auxiliary_loss_clip": 0.01108142, + "auxiliary_loss_mlp": 0.01025792, + "balance_loss_clip": 1.04441047, + "balance_loss_mlp": 1.01903009, + "epoch": 0.8839054890879576, + "flos": 17822574195840.0, + "grad_norm": 2.448918069540013, + "language_loss": 0.75964421, + "learning_rate": 1.396108959410014e-07, + "loss": 0.78098363, + "num_input_tokens_seen": 159043315, + "step": 7351, + "time_per_iteration": 2.516375780105591 + }, + { + "auxiliary_loss_clip": 0.0115221, + "auxiliary_loss_mlp": 0.00761962, + "balance_loss_clip": 1.04712796, + "balance_loss_mlp": 1.00036573, + "epoch": 0.8840257319785968, + "flos": 23769021818880.0, + "grad_norm": 1.7945940777075668, + "language_loss": 0.81374007, + "learning_rate": 1.3932510236477745e-07, + "loss": 0.83288181, + "num_input_tokens_seen": 159063985, + "step": 7352, + "time_per_iteration": 2.462893486022949 + }, + { + "auxiliary_loss_clip": 0.0114889, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.04266739, + "balance_loss_mlp": 1.01796532, + "epoch": 0.8841459748692359, + "flos": 29059776622080.0, + "grad_norm": 1.7131172184681667, + "language_loss": 0.5630846, + "learning_rate": 1.3903959105145636e-07, + "loss": 0.58482969, + "num_input_tokens_seen": 159084475, + "step": 7353, + "time_per_iteration": 2.5107421875 + }, + { + "auxiliary_loss_clip": 0.01164539, + "auxiliary_loss_mlp": 0.01023206, + "balance_loss_clip": 1.04688239, + "balance_loss_mlp": 1.01627707, + "epoch": 0.8842662177598749, + "flos": 24311523905280.0, + "grad_norm": 2.3033963809454545, + "language_loss": 0.83300853, + "learning_rate": 1.387543620443492e-07, + "loss": 0.85488594, + "num_input_tokens_seen": 159101320, + "step": 7354, + "time_per_iteration": 2.418693780899048 + }, + { + "auxiliary_loss_clip": 0.01164544, + "auxiliary_loss_mlp": 0.01024318, + "balance_loss_clip": 1.04751563, + "balance_loss_mlp": 1.01733816, + "epoch": 0.8843864606505141, + "flos": 25007867942400.0, + "grad_norm": 1.6967419519665319, + "language_loss": 0.84051484, + "learning_rate": 1.3846941538672606e-07, + "loss": 0.86240351, + "num_input_tokens_seen": 159120025, + "step": 7355, + "time_per_iteration": 2.4436275959014893 + }, + { + "auxiliary_loss_clip": 0.01112892, + "auxiliary_loss_mlp": 0.01023019, + "balance_loss_clip": 1.04326117, + "balance_loss_mlp": 1.01636386, + "epoch": 0.8845067035411531, + "flos": 28183915388160.0, + "grad_norm": 2.1474783124500516, + "language_loss": 0.80969095, + "learning_rate": 1.3818475112181193e-07, + "loss": 0.8310501, + "num_input_tokens_seen": 159138820, + "step": 7356, + "time_per_iteration": 3.369537830352783 + }, + { + "auxiliary_loss_clip": 0.01135514, + "auxiliary_loss_mlp": 0.01023688, + "balance_loss_clip": 1.04359782, + "balance_loss_mlp": 1.01730442, + "epoch": 0.8846269464317922, + "flos": 12853219311360.0, + "grad_norm": 2.0950300963596704, + "language_loss": 0.80003589, + "learning_rate": 1.3790036929279091e-07, + "loss": 0.82162786, + "num_input_tokens_seen": 159155975, + "step": 7357, + "time_per_iteration": 2.47066593170166 + }, + { + "auxiliary_loss_clip": 0.01154914, + "auxiliary_loss_mlp": 0.00762031, + "balance_loss_clip": 1.04783964, + "balance_loss_mlp": 1.00032699, + "epoch": 0.8847471893224313, + "flos": 18624351628800.0, + "grad_norm": 2.2044592804261054, + "language_loss": 0.5883671, + "learning_rate": 1.3761626994280363e-07, + "loss": 0.60753655, + "num_input_tokens_seen": 159173445, + "step": 7358, + "time_per_iteration": 2.4363293647766113 + }, + { + "auxiliary_loss_clip": 0.01129126, + "auxiliary_loss_mlp": 0.01023427, + "balance_loss_clip": 1.04412818, + "balance_loss_mlp": 1.01628006, + "epoch": 0.8848674322130704, + "flos": 35769433449600.0, + "grad_norm": 1.698533001841016, + "language_loss": 0.73515761, + "learning_rate": 1.3733245311494735e-07, + "loss": 0.75668311, + "num_input_tokens_seen": 159196100, + "step": 7359, + "time_per_iteration": 2.6968295574188232 + }, + { + "auxiliary_loss_clip": 0.01153576, + "auxiliary_loss_mlp": 0.01024562, + "balance_loss_clip": 1.04804122, + "balance_loss_mlp": 1.01734066, + "epoch": 0.8849876751037095, + "flos": 24243760897920.0, + "grad_norm": 3.9645384812960938, + "language_loss": 0.70654458, + "learning_rate": 1.3704891885227676e-07, + "loss": 0.72832602, + "num_input_tokens_seen": 159216145, + "step": 7360, + "time_per_iteration": 2.474438428878784 + }, + { + "auxiliary_loss_clip": 0.0112333, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.04123974, + "balance_loss_mlp": 1.02166462, + "epoch": 0.8851079179943486, + "flos": 21500580251520.0, + "grad_norm": 1.8429630333122524, + "language_loss": 0.78026068, + "learning_rate": 1.367656671978037e-07, + "loss": 0.80179018, + "num_input_tokens_seen": 159233610, + "step": 7361, + "time_per_iteration": 2.5289735794067383 + }, + { + "auxiliary_loss_clip": 0.01142969, + "auxiliary_loss_mlp": 0.01025879, + "balance_loss_clip": 1.04348564, + "balance_loss_mlp": 1.01915514, + "epoch": 0.8852281608849877, + "flos": 15300711198720.0, + "grad_norm": 1.9542397882552758, + "language_loss": 0.73432469, + "learning_rate": 1.36482698194498e-07, + "loss": 0.75601315, + "num_input_tokens_seen": 159250155, + "step": 7362, + "time_per_iteration": 2.4433975219726562 + }, + { + "auxiliary_loss_clip": 0.01136594, + "auxiliary_loss_mlp": 0.01025033, + "balance_loss_clip": 1.04306769, + "balance_loss_mlp": 1.01749301, + "epoch": 0.8853484037756267, + "flos": 23295719283840.0, + "grad_norm": 1.7797861929175878, + "language_loss": 0.71842694, + "learning_rate": 1.3620001188528506e-07, + "loss": 0.74004328, + "num_input_tokens_seen": 159270875, + "step": 7363, + "time_per_iteration": 2.519090414047241 + }, + { + "auxiliary_loss_clip": 0.01153476, + "auxiliary_loss_mlp": 0.01024235, + "balance_loss_clip": 1.04402423, + "balance_loss_mlp": 1.01638174, + "epoch": 0.8854686466662659, + "flos": 25114773795840.0, + "grad_norm": 2.482945675780781, + "language_loss": 0.73656273, + "learning_rate": 1.3591760831304865e-07, + "loss": 0.75833976, + "num_input_tokens_seen": 159288565, + "step": 7364, + "time_per_iteration": 2.475139617919922 + }, + { + "auxiliary_loss_clip": 0.01163315, + "auxiliary_loss_mlp": 0.01022319, + "balance_loss_clip": 1.04613352, + "balance_loss_mlp": 1.01521993, + "epoch": 0.885588889556905, + "flos": 21390873137280.0, + "grad_norm": 2.3943866196458363, + "language_loss": 0.7963419, + "learning_rate": 1.356354875206287e-07, + "loss": 0.8181982, + "num_input_tokens_seen": 159306400, + "step": 7365, + "time_per_iteration": 2.425713300704956 + }, + { + "auxiliary_loss_clip": 0.01120097, + "auxiliary_loss_mlp": 0.01021032, + "balance_loss_clip": 1.04411805, + "balance_loss_mlp": 1.01428771, + "epoch": 0.885709132447544, + "flos": 26906752431360.0, + "grad_norm": 2.4027470743625323, + "language_loss": 0.70194697, + "learning_rate": 1.3535364955082296e-07, + "loss": 0.72335827, + "num_input_tokens_seen": 159326250, + "step": 7366, + "time_per_iteration": 2.545585870742798 + }, + { + "auxiliary_loss_clip": 0.01161817, + "auxiliary_loss_mlp": 0.01022819, + "balance_loss_clip": 1.04649854, + "balance_loss_mlp": 1.01589322, + "epoch": 0.8858293753381832, + "flos": 26103394800000.0, + "grad_norm": 1.7741755039733227, + "language_loss": 0.64465559, + "learning_rate": 1.3507209444638613e-07, + "loss": 0.66650188, + "num_input_tokens_seen": 159348250, + "step": 7367, + "time_per_iteration": 2.4805407524108887 + }, + { + "auxiliary_loss_clip": 0.01151003, + "auxiliary_loss_mlp": 0.01023902, + "balance_loss_clip": 1.04648042, + "balance_loss_mlp": 1.01688075, + "epoch": 0.8859496182288222, + "flos": 23292810282240.0, + "grad_norm": 2.0168229738428836, + "language_loss": 0.74196815, + "learning_rate": 1.347908222500298e-07, + "loss": 0.76371723, + "num_input_tokens_seen": 159368325, + "step": 7368, + "time_per_iteration": 2.452939510345459 + }, + { + "auxiliary_loss_clip": 0.01112959, + "auxiliary_loss_mlp": 0.01026246, + "balance_loss_clip": 1.04356933, + "balance_loss_mlp": 1.01949, + "epoch": 0.8860698611194613, + "flos": 16872916469760.0, + "grad_norm": 1.9161946428212586, + "language_loss": 0.69581848, + "learning_rate": 1.3450983300442276e-07, + "loss": 0.71721053, + "num_input_tokens_seen": 159387555, + "step": 7369, + "time_per_iteration": 2.5182132720947266 + }, + { + "auxiliary_loss_clip": 0.01151776, + "auxiliary_loss_mlp": 0.0102322, + "balance_loss_clip": 1.04609036, + "balance_loss_mlp": 1.01636243, + "epoch": 0.8861901040101005, + "flos": 24681404206080.0, + "grad_norm": 2.098912052055472, + "language_loss": 0.73768014, + "learning_rate": 1.3422912675219068e-07, + "loss": 0.75943005, + "num_input_tokens_seen": 159407310, + "step": 7370, + "time_per_iteration": 2.4848122596740723 + }, + { + "auxiliary_loss_clip": 0.01161121, + "auxiliary_loss_mlp": 0.01020819, + "balance_loss_clip": 1.04725075, + "balance_loss_mlp": 1.01467657, + "epoch": 0.8863103469007395, + "flos": 24423026699520.0, + "grad_norm": 1.51897907599744, + "language_loss": 0.79161704, + "learning_rate": 1.339487035359166e-07, + "loss": 0.81343639, + "num_input_tokens_seen": 159427680, + "step": 7371, + "time_per_iteration": 2.488556385040283 + }, + { + "auxiliary_loss_clip": 0.0113892, + "auxiliary_loss_mlp": 0.00761439, + "balance_loss_clip": 1.04668522, + "balance_loss_mlp": 1.00043988, + "epoch": 0.8864305897913786, + "flos": 22053964158720.0, + "grad_norm": 1.5583586568860697, + "language_loss": 0.84675568, + "learning_rate": 1.336685633981409e-07, + "loss": 0.86575925, + "num_input_tokens_seen": 159448765, + "step": 7372, + "time_per_iteration": 2.502751111984253 + }, + { + "auxiliary_loss_clip": 0.01152084, + "auxiliary_loss_mlp": 0.01023228, + "balance_loss_clip": 1.04411387, + "balance_loss_mlp": 1.01605463, + "epoch": 0.8865508326820177, + "flos": 19099449843840.0, + "grad_norm": 1.759198767596091, + "language_loss": 0.75007427, + "learning_rate": 1.333887063813597e-07, + "loss": 0.77182746, + "num_input_tokens_seen": 159466870, + "step": 7373, + "time_per_iteration": 2.4545724391937256 + }, + { + "auxiliary_loss_clip": 0.01136512, + "auxiliary_loss_mlp": 0.01019296, + "balance_loss_clip": 1.0419085, + "balance_loss_mlp": 1.01268888, + "epoch": 0.8866710755726568, + "flos": 15414189240960.0, + "grad_norm": 1.8081956465789568, + "language_loss": 0.65971494, + "learning_rate": 1.331091325280278e-07, + "loss": 0.68127298, + "num_input_tokens_seen": 159485840, + "step": 7374, + "time_per_iteration": 2.477057933807373 + }, + { + "auxiliary_loss_clip": 0.01101656, + "auxiliary_loss_mlp": 0.01024392, + "balance_loss_clip": 1.04003167, + "balance_loss_mlp": 1.01719809, + "epoch": 0.8867913184632958, + "flos": 20083689388800.0, + "grad_norm": 1.629645969604547, + "language_loss": 0.78244472, + "learning_rate": 1.3282984188055625e-07, + "loss": 0.80370522, + "num_input_tokens_seen": 159505630, + "step": 7375, + "time_per_iteration": 4.1883156299591064 + }, + { + "auxiliary_loss_clip": 0.01162492, + "auxiliary_loss_mlp": 0.01029274, + "balance_loss_clip": 1.04465771, + "balance_loss_mlp": 1.02260971, + "epoch": 0.8869115613539349, + "flos": 23365852588800.0, + "grad_norm": 1.8117834368326777, + "language_loss": 0.79558909, + "learning_rate": 1.3255083448131288e-07, + "loss": 0.81750679, + "num_input_tokens_seen": 159524675, + "step": 7376, + "time_per_iteration": 2.4247028827667236 + }, + { + "auxiliary_loss_clip": 0.01152487, + "auxiliary_loss_mlp": 0.010239, + "balance_loss_clip": 1.04326463, + "balance_loss_mlp": 1.01706338, + "epoch": 0.8870318042445741, + "flos": 21286840371840.0, + "grad_norm": 1.9723421291595973, + "language_loss": 0.78709328, + "learning_rate": 1.3227211037262365e-07, + "loss": 0.80885714, + "num_input_tokens_seen": 159541915, + "step": 7377, + "time_per_iteration": 3.265744924545288 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.01027215, + "balance_loss_clip": 1.03944755, + "balance_loss_mlp": 1.01971054, + "epoch": 0.8871520471352131, + "flos": 20010862563840.0, + "grad_norm": 2.6128971238669627, + "language_loss": 0.85398757, + "learning_rate": 1.319936695967696e-07, + "loss": 0.8753612, + "num_input_tokens_seen": 159559740, + "step": 7378, + "time_per_iteration": 2.5648369789123535 + }, + { + "auxiliary_loss_clip": 0.01172738, + "auxiliary_loss_mlp": 0.01023659, + "balance_loss_clip": 1.04862738, + "balance_loss_mlp": 1.01571369, + "epoch": 0.8872722900258522, + "flos": 22601422321920.0, + "grad_norm": 2.4603420899076847, + "language_loss": 0.82059735, + "learning_rate": 1.3171551219599097e-07, + "loss": 0.84256136, + "num_input_tokens_seen": 159578265, + "step": 7379, + "time_per_iteration": 2.429309368133545 + }, + { + "auxiliary_loss_clip": 0.01166356, + "auxiliary_loss_mlp": 0.01023926, + "balance_loss_clip": 1.04935789, + "balance_loss_mlp": 1.01661849, + "epoch": 0.8873925329164913, + "flos": 22163276223360.0, + "grad_norm": 10.800634261934333, + "language_loss": 0.78081715, + "learning_rate": 1.3143763821248377e-07, + "loss": 0.80271995, + "num_input_tokens_seen": 159595350, + "step": 7380, + "time_per_iteration": 2.423342704772949 + }, + { + "auxiliary_loss_clip": 0.01162399, + "auxiliary_loss_mlp": 0.01025604, + "balance_loss_clip": 1.04613316, + "balance_loss_mlp": 1.01890492, + "epoch": 0.8875127758071304, + "flos": 19208223204480.0, + "grad_norm": 2.039672831047613, + "language_loss": 0.72376847, + "learning_rate": 1.3116004768840118e-07, + "loss": 0.7456485, + "num_input_tokens_seen": 159613725, + "step": 7381, + "time_per_iteration": 2.395111560821533 + }, + { + "auxiliary_loss_clip": 0.01165582, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_clip": 1.04605055, + "balance_loss_mlp": 1.02000403, + "epoch": 0.8876330186977694, + "flos": 18110900666880.0, + "grad_norm": 1.605186338971091, + "language_loss": 0.73751259, + "learning_rate": 1.3088274066585348e-07, + "loss": 0.75944197, + "num_input_tokens_seen": 159631335, + "step": 7382, + "time_per_iteration": 2.3889708518981934 + }, + { + "auxiliary_loss_clip": 0.01129574, + "auxiliary_loss_mlp": 0.01022226, + "balance_loss_clip": 1.04249358, + "balance_loss_mlp": 1.01566339, + "epoch": 0.8877532615884086, + "flos": 22009434272640.0, + "grad_norm": 2.141240953957513, + "language_loss": 0.90490758, + "learning_rate": 1.3060571718690749e-07, + "loss": 0.92642558, + "num_input_tokens_seen": 159648830, + "step": 7383, + "time_per_iteration": 3.247978687286377 + }, + { + "auxiliary_loss_clip": 0.01030585, + "auxiliary_loss_mlp": 0.00753116, + "balance_loss_clip": 1.00662804, + "balance_loss_mlp": 0.99996281, + "epoch": 0.8878735044790477, + "flos": 72136924346880.0, + "grad_norm": 0.740767475928398, + "language_loss": 0.56882465, + "learning_rate": 1.3032897729358805e-07, + "loss": 0.5866617, + "num_input_tokens_seen": 159709785, + "step": 7384, + "time_per_iteration": 3.098950147628784 + }, + { + "auxiliary_loss_clip": 0.01083905, + "auxiliary_loss_mlp": 0.00762132, + "balance_loss_clip": 1.03628862, + "balance_loss_mlp": 1.00048435, + "epoch": 0.8879937473696867, + "flos": 27526355061120.0, + "grad_norm": 1.8490682792470905, + "language_loss": 0.799389, + "learning_rate": 1.3005252102787645e-07, + "loss": 0.81784934, + "num_input_tokens_seen": 159728725, + "step": 7385, + "time_per_iteration": 2.620107412338257 + }, + { + "auxiliary_loss_clip": 0.01153436, + "auxiliary_loss_mlp": 0.01022024, + "balance_loss_clip": 1.04546702, + "balance_loss_mlp": 1.01478839, + "epoch": 0.8881139902603259, + "flos": 22234091886720.0, + "grad_norm": 1.5884687384779326, + "language_loss": 0.73328096, + "learning_rate": 1.297763484317105e-07, + "loss": 0.75503558, + "num_input_tokens_seen": 159747020, + "step": 7386, + "time_per_iteration": 2.456625461578369 + }, + { + "auxiliary_loss_clip": 0.0110674, + "auxiliary_loss_mlp": 0.00762024, + "balance_loss_clip": 1.03916097, + "balance_loss_mlp": 1.00045383, + "epoch": 0.888234233150965, + "flos": 20299548170880.0, + "grad_norm": 2.4678131581554066, + "language_loss": 0.7072646, + "learning_rate": 1.2950045954698551e-07, + "loss": 0.72595221, + "num_input_tokens_seen": 159764855, + "step": 7387, + "time_per_iteration": 2.5199553966522217 + }, + { + "auxiliary_loss_clip": 0.01116769, + "auxiliary_loss_mlp": 0.01023508, + "balance_loss_clip": 1.04328775, + "balance_loss_mlp": 1.0166651, + "epoch": 0.888354476041604, + "flos": 18147996437760.0, + "grad_norm": 2.566538141348987, + "language_loss": 0.75402319, + "learning_rate": 1.2922485441555343e-07, + "loss": 0.77542597, + "num_input_tokens_seen": 159783935, + "step": 7388, + "time_per_iteration": 2.495147705078125 + }, + { + "auxiliary_loss_clip": 0.01162016, + "auxiliary_loss_mlp": 0.0102299, + "balance_loss_clip": 1.04497838, + "balance_loss_mlp": 1.01596832, + "epoch": 0.8884747189322432, + "flos": 22014282608640.0, + "grad_norm": 1.746468268030024, + "language_loss": 0.81769919, + "learning_rate": 1.2894953307922363e-07, + "loss": 0.8395493, + "num_input_tokens_seen": 159802895, + "step": 7389, + "time_per_iteration": 2.4161171913146973 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01023653, + "balance_loss_clip": 1.04299176, + "balance_loss_mlp": 1.01690578, + "epoch": 0.8885949618228822, + "flos": 19786779567360.0, + "grad_norm": 2.2308530063595366, + "language_loss": 0.84037381, + "learning_rate": 1.2867449557976208e-07, + "loss": 0.86179847, + "num_input_tokens_seen": 159820995, + "step": 7390, + "time_per_iteration": 2.4809858798980713 + }, + { + "auxiliary_loss_clip": 0.01150604, + "auxiliary_loss_mlp": 0.01023462, + "balance_loss_clip": 1.04704762, + "balance_loss_mlp": 1.01650918, + "epoch": 0.8887152047135213, + "flos": 20047599198720.0, + "grad_norm": 2.4767584532694107, + "language_loss": 0.7560463, + "learning_rate": 1.283997419588916e-07, + "loss": 0.77778697, + "num_input_tokens_seen": 159840465, + "step": 7391, + "time_per_iteration": 2.4775102138519287 + }, + { + "auxiliary_loss_clip": 0.01154936, + "auxiliary_loss_mlp": 0.01024917, + "balance_loss_clip": 1.04622376, + "balance_loss_mlp": 1.01793742, + "epoch": 0.8888354476041604, + "flos": 18588117784320.0, + "grad_norm": 2.9011225206470748, + "language_loss": 0.61898756, + "learning_rate": 1.2812527225829216e-07, + "loss": 0.64078605, + "num_input_tokens_seen": 159858690, + "step": 7392, + "time_per_iteration": 2.428845167160034 + }, + { + "auxiliary_loss_clip": 0.01158261, + "auxiliary_loss_mlp": 0.01022825, + "balance_loss_clip": 1.04874802, + "balance_loss_mlp": 1.01490343, + "epoch": 0.8889556904947995, + "flos": 21689794120320.0, + "grad_norm": 1.8064572492466093, + "language_loss": 0.76296711, + "learning_rate": 1.2785108651960052e-07, + "loss": 0.784778, + "num_input_tokens_seen": 159880325, + "step": 7393, + "time_per_iteration": 2.478649616241455 + }, + { + "auxiliary_loss_clip": 0.01154083, + "auxiliary_loss_mlp": 0.01024215, + "balance_loss_clip": 1.0462513, + "balance_loss_mlp": 1.01708913, + "epoch": 0.8890759333854386, + "flos": 27381204201600.0, + "grad_norm": 1.8777480353814533, + "language_loss": 0.80651295, + "learning_rate": 1.2757718478441094e-07, + "loss": 0.82829589, + "num_input_tokens_seen": 159901070, + "step": 7394, + "time_per_iteration": 2.494816303253174 + }, + { + "auxiliary_loss_clip": 0.01135999, + "auxiliary_loss_mlp": 0.01020188, + "balance_loss_clip": 1.04309082, + "balance_loss_mlp": 1.01359856, + "epoch": 0.8891961762760777, + "flos": 24498834353280.0, + "grad_norm": 1.698131686081044, + "language_loss": 0.77360392, + "learning_rate": 1.2730356709427302e-07, + "loss": 0.79516578, + "num_input_tokens_seen": 159919750, + "step": 7395, + "time_per_iteration": 2.5131702423095703 + }, + { + "auxiliary_loss_clip": 0.01151857, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.04872251, + "balance_loss_mlp": 1.02248871, + "epoch": 0.8893164191667168, + "flos": 41499770895360.0, + "grad_norm": 1.5232726930304756, + "language_loss": 0.59716618, + "learning_rate": 1.2703023349069542e-07, + "loss": 0.61898559, + "num_input_tokens_seen": 159944600, + "step": 7396, + "time_per_iteration": 2.6196460723876953 + }, + { + "auxiliary_loss_clip": 0.01149084, + "auxiliary_loss_mlp": 0.01022189, + "balance_loss_clip": 1.04570353, + "balance_loss_mlp": 1.0152247, + "epoch": 0.8894366620573558, + "flos": 33583623120000.0, + "grad_norm": 2.018056443920704, + "language_loss": 0.61910081, + "learning_rate": 1.2675718401514223e-07, + "loss": 0.64081353, + "num_input_tokens_seen": 159968780, + "step": 7397, + "time_per_iteration": 2.5523598194122314 + }, + { + "auxiliary_loss_clip": 0.01136144, + "auxiliary_loss_mlp": 0.01023348, + "balance_loss_clip": 1.04382122, + "balance_loss_mlp": 1.01604617, + "epoch": 0.889556904947995, + "flos": 16909832672640.0, + "grad_norm": 1.876765995548863, + "language_loss": 0.74329406, + "learning_rate": 1.264844187090346e-07, + "loss": 0.764889, + "num_input_tokens_seen": 159985905, + "step": 7398, + "time_per_iteration": 2.4495253562927246 + }, + { + "auxiliary_loss_clip": 0.01130571, + "auxiliary_loss_mlp": 0.01022667, + "balance_loss_clip": 1.0420146, + "balance_loss_mlp": 1.01581502, + "epoch": 0.889677147838634, + "flos": 26030855283840.0, + "grad_norm": 1.6358880314610367, + "language_loss": 0.75075173, + "learning_rate": 1.262119376137516e-07, + "loss": 0.77228409, + "num_input_tokens_seen": 160006965, + "step": 7399, + "time_per_iteration": 2.5144355297088623 + }, + { + "auxiliary_loss_clip": 0.01140077, + "auxiliary_loss_mlp": 0.0102124, + "balance_loss_clip": 1.04162705, + "balance_loss_mlp": 1.01450479, + "epoch": 0.8897973907292731, + "flos": 26468283110400.0, + "grad_norm": 1.7458554696152109, + "language_loss": 0.85329747, + "learning_rate": 1.2593974077062707e-07, + "loss": 0.87491059, + "num_input_tokens_seen": 160028585, + "step": 7400, + "time_per_iteration": 2.476206064224243 + }, + { + "auxiliary_loss_clip": 0.01113747, + "auxiliary_loss_mlp": 0.01024551, + "balance_loss_clip": 1.04120493, + "balance_loss_mlp": 1.01775885, + "epoch": 0.8899176336199123, + "flos": 26249694894720.0, + "grad_norm": 7.562010722876379, + "language_loss": 0.63592076, + "learning_rate": 1.2566782822095423e-07, + "loss": 0.65730375, + "num_input_tokens_seen": 160048840, + "step": 7401, + "time_per_iteration": 4.126861333847046 + }, + { + "auxiliary_loss_clip": 0.01129818, + "auxiliary_loss_mlp": 0.0102283, + "balance_loss_clip": 1.04625154, + "balance_loss_mlp": 1.01578498, + "epoch": 0.8900378765105513, + "flos": 20811742156800.0, + "grad_norm": 2.8760480489099813, + "language_loss": 0.71339417, + "learning_rate": 1.2539620000598162e-07, + "loss": 0.73492062, + "num_input_tokens_seen": 160068175, + "step": 7402, + "time_per_iteration": 2.5083558559417725 + }, + { + "auxiliary_loss_clip": 0.01164046, + "auxiliary_loss_mlp": 0.01025839, + "balance_loss_clip": 1.04726446, + "balance_loss_mlp": 1.01882672, + "epoch": 0.8901581194011904, + "flos": 16472333018880.0, + "grad_norm": 1.883859289469901, + "language_loss": 0.79626667, + "learning_rate": 1.2512485616691492e-07, + "loss": 0.81816554, + "num_input_tokens_seen": 160085230, + "step": 7403, + "time_per_iteration": 3.2173001766204834 + }, + { + "auxiliary_loss_clip": 0.01125272, + "auxiliary_loss_mlp": 0.01030064, + "balance_loss_clip": 1.04240394, + "balance_loss_mlp": 1.02227354, + "epoch": 0.8902783622918296, + "flos": 35155253773440.0, + "grad_norm": 1.5119369748728388, + "language_loss": 0.81071472, + "learning_rate": 1.2485379674491681e-07, + "loss": 0.83226812, + "num_input_tokens_seen": 160111425, + "step": 7404, + "time_per_iteration": 2.6411101818084717 + }, + { + "auxiliary_loss_clip": 0.01135224, + "auxiliary_loss_mlp": 0.01026599, + "balance_loss_clip": 1.0448761, + "balance_loss_mlp": 1.01967311, + "epoch": 0.8903986051824686, + "flos": 17201068145280.0, + "grad_norm": 2.6336134624395586, + "language_loss": 0.79408747, + "learning_rate": 1.2458302178110657e-07, + "loss": 0.81570572, + "num_input_tokens_seen": 160129790, + "step": 7405, + "time_per_iteration": 2.427273988723755 + }, + { + "auxiliary_loss_clip": 0.01111209, + "auxiliary_loss_mlp": 0.01019656, + "balance_loss_clip": 1.04037714, + "balance_loss_mlp": 1.01325107, + "epoch": 0.8905188480731077, + "flos": 25483863997440.0, + "grad_norm": 1.941480803882672, + "language_loss": 0.82519531, + "learning_rate": 1.2431253131656118e-07, + "loss": 0.84650397, + "num_input_tokens_seen": 160149265, + "step": 7406, + "time_per_iteration": 2.5330002307891846 + }, + { + "auxiliary_loss_clip": 0.01129471, + "auxiliary_loss_mlp": 0.01024975, + "balance_loss_clip": 1.04289412, + "balance_loss_mlp": 1.01754248, + "epoch": 0.8906390909637467, + "flos": 23365888502400.0, + "grad_norm": 3.7995477910424658, + "language_loss": 0.76877761, + "learning_rate": 1.240423253923133e-07, + "loss": 0.79032207, + "num_input_tokens_seen": 160168870, + "step": 7407, + "time_per_iteration": 2.478048086166382 + }, + { + "auxiliary_loss_clip": 0.01150761, + "auxiliary_loss_mlp": 0.01026083, + "balance_loss_clip": 1.0448072, + "balance_loss_mlp": 1.01868892, + "epoch": 0.8907593338543859, + "flos": 21068790860160.0, + "grad_norm": 1.8226502738910912, + "language_loss": 0.69473112, + "learning_rate": 1.237724040493533e-07, + "loss": 0.71649951, + "num_input_tokens_seen": 160187495, + "step": 7408, + "time_per_iteration": 2.421055793762207 + }, + { + "auxiliary_loss_clip": 0.01170912, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.05000138, + "balance_loss_mlp": 1.02120709, + "epoch": 0.8908795767450249, + "flos": 21869562712320.0, + "grad_norm": 3.31065782897963, + "language_loss": 0.73056811, + "learning_rate": 1.2350276732862773e-07, + "loss": 0.75256515, + "num_input_tokens_seen": 160208520, + "step": 7409, + "time_per_iteration": 2.4612503051757812 + }, + { + "auxiliary_loss_clip": 0.01052676, + "auxiliary_loss_mlp": 0.01000968, + "balance_loss_clip": 1.0082022, + "balance_loss_mlp": 1.00007427, + "epoch": 0.890999819635664, + "flos": 66307869348480.0, + "grad_norm": 0.8374006158886216, + "language_loss": 0.56670988, + "learning_rate": 1.2323341527103993e-07, + "loss": 0.5872463, + "num_input_tokens_seen": 160263720, + "step": 7410, + "time_per_iteration": 3.659691572189331 + }, + { + "auxiliary_loss_clip": 0.01162823, + "auxiliary_loss_mlp": 0.01023182, + "balance_loss_clip": 1.04615211, + "balance_loss_mlp": 1.01632476, + "epoch": 0.8911200625263032, + "flos": 26869908055680.0, + "grad_norm": 2.1003972524876815, + "language_loss": 0.85460377, + "learning_rate": 1.2296434791745135e-07, + "loss": 0.87646377, + "num_input_tokens_seen": 160282170, + "step": 7411, + "time_per_iteration": 2.4414825439453125 + }, + { + "auxiliary_loss_clip": 0.01154789, + "auxiliary_loss_mlp": 0.01026548, + "balance_loss_clip": 1.04659259, + "balance_loss_mlp": 1.01940739, + "epoch": 0.8912403054169422, + "flos": 20885825957760.0, + "grad_norm": 2.557252075666105, + "language_loss": 0.76739734, + "learning_rate": 1.2269556530867875e-07, + "loss": 0.78921074, + "num_input_tokens_seen": 160300725, + "step": 7412, + "time_per_iteration": 2.450429677963257 + }, + { + "auxiliary_loss_clip": 0.01170689, + "auxiliary_loss_mlp": 0.01025854, + "balance_loss_clip": 1.04911482, + "balance_loss_mlp": 1.01763499, + "epoch": 0.8913605483075813, + "flos": 27016567286400.0, + "grad_norm": 2.022584928617387, + "language_loss": 0.82289821, + "learning_rate": 1.2242706748549614e-07, + "loss": 0.84486365, + "num_input_tokens_seen": 160318720, + "step": 7413, + "time_per_iteration": 2.451805830001831 + }, + { + "auxiliary_loss_clip": 0.01136557, + "auxiliary_loss_mlp": 0.01019489, + "balance_loss_clip": 1.04030132, + "balance_loss_mlp": 1.01242292, + "epoch": 0.8914807911982204, + "flos": 23621500661760.0, + "grad_norm": 1.7612669472426261, + "language_loss": 0.82317173, + "learning_rate": 1.2215885448863473e-07, + "loss": 0.84473217, + "num_input_tokens_seen": 160339595, + "step": 7414, + "time_per_iteration": 2.521181106567383 + }, + { + "auxiliary_loss_clip": 0.01136878, + "auxiliary_loss_mlp": 0.01025779, + "balance_loss_clip": 1.04524505, + "balance_loss_mlp": 1.01901364, + "epoch": 0.8916010340888595, + "flos": 24462277286400.0, + "grad_norm": 1.9553151334583236, + "language_loss": 0.8039254, + "learning_rate": 1.2189092635878152e-07, + "loss": 0.82555199, + "num_input_tokens_seen": 160361045, + "step": 7415, + "time_per_iteration": 2.5448262691497803 + }, + { + "auxiliary_loss_clip": 0.01113019, + "auxiliary_loss_mlp": 0.01023014, + "balance_loss_clip": 1.0408926, + "balance_loss_mlp": 1.01594472, + "epoch": 0.8917212769794985, + "flos": 21215773313280.0, + "grad_norm": 1.9104440364754791, + "language_loss": 0.7771312, + "learning_rate": 1.216232831365822e-07, + "loss": 0.7984916, + "num_input_tokens_seen": 160379990, + "step": 7416, + "time_per_iteration": 2.550075054168701 + }, + { + "auxiliary_loss_clip": 0.01143079, + "auxiliary_loss_mlp": 0.01026576, + "balance_loss_clip": 1.04576087, + "balance_loss_mlp": 1.01917338, + "epoch": 0.8918415198701377, + "flos": 25513992529920.0, + "grad_norm": 1.9466832616763416, + "language_loss": 0.80975842, + "learning_rate": 1.2135592486263678e-07, + "loss": 0.83145499, + "num_input_tokens_seen": 160399240, + "step": 7417, + "time_per_iteration": 2.5348081588745117 + }, + { + "auxiliary_loss_clip": 0.01135294, + "auxiliary_loss_mlp": 0.01025101, + "balance_loss_clip": 1.042979, + "balance_loss_mlp": 1.01838028, + "epoch": 0.8919617627607768, + "flos": 37853006693760.0, + "grad_norm": 1.6980455305116617, + "language_loss": 0.61257374, + "learning_rate": 1.2108885157750415e-07, + "loss": 0.63417768, + "num_input_tokens_seen": 160421600, + "step": 7418, + "time_per_iteration": 2.607163190841675 + }, + { + "auxiliary_loss_clip": 0.01118372, + "auxiliary_loss_mlp": 0.00761162, + "balance_loss_clip": 1.04372573, + "balance_loss_mlp": 1.00039625, + "epoch": 0.8920820056514158, + "flos": 26213676531840.0, + "grad_norm": 1.8163406681216865, + "language_loss": 0.79795241, + "learning_rate": 1.2082206332169897e-07, + "loss": 0.81674778, + "num_input_tokens_seen": 160441695, + "step": 7419, + "time_per_iteration": 2.5493650436401367 + }, + { + "auxiliary_loss_clip": 0.01133754, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.04627526, + "balance_loss_mlp": 1.02039957, + "epoch": 0.892202248542055, + "flos": 17383135207680.0, + "grad_norm": 4.941490706673702, + "language_loss": 0.73491853, + "learning_rate": 1.2055556013569225e-07, + "loss": 0.75653541, + "num_input_tokens_seen": 160457205, + "step": 7420, + "time_per_iteration": 2.424400568008423 + }, + { + "auxiliary_loss_clip": 0.0113947, + "auxiliary_loss_mlp": 0.01023454, + "balance_loss_clip": 1.04440367, + "balance_loss_mlp": 1.01654625, + "epoch": 0.892322491432694, + "flos": 21324223451520.0, + "grad_norm": 1.8439221813257884, + "language_loss": 0.82086712, + "learning_rate": 1.2028934205991315e-07, + "loss": 0.84249634, + "num_input_tokens_seen": 160476525, + "step": 7421, + "time_per_iteration": 2.496135950088501 + }, + { + "auxiliary_loss_clip": 0.01149567, + "auxiliary_loss_mlp": 0.01021889, + "balance_loss_clip": 1.04336703, + "balance_loss_mlp": 1.01447713, + "epoch": 0.8924427343233331, + "flos": 24029374573440.0, + "grad_norm": 1.7799896893532616, + "language_loss": 0.76575053, + "learning_rate": 1.2002340913474607e-07, + "loss": 0.7874651, + "num_input_tokens_seen": 160500160, + "step": 7422, + "time_per_iteration": 2.5181031227111816 + }, + { + "auxiliary_loss_clip": 0.01165782, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.04667592, + "balance_loss_mlp": 1.02164483, + "epoch": 0.8925629772139723, + "flos": 30008069631360.0, + "grad_norm": 11.415048182270004, + "language_loss": 0.74153155, + "learning_rate": 1.1975776140053317e-07, + "loss": 0.76348078, + "num_input_tokens_seen": 160520130, + "step": 7423, + "time_per_iteration": 2.4831578731536865 + }, + { + "auxiliary_loss_clip": 0.01111058, + "auxiliary_loss_mlp": 0.01025745, + "balance_loss_clip": 1.04205298, + "balance_loss_mlp": 1.01827919, + "epoch": 0.8926832201046113, + "flos": 22601709630720.0, + "grad_norm": 1.8767466371757187, + "language_loss": 0.73635966, + "learning_rate": 1.194923988975729e-07, + "loss": 0.75772768, + "num_input_tokens_seen": 160539730, + "step": 7424, + "time_per_iteration": 2.5377964973449707 + }, + { + "auxiliary_loss_clip": 0.01119216, + "auxiliary_loss_mlp": 0.01020395, + "balance_loss_clip": 1.04280376, + "balance_loss_mlp": 1.01238418, + "epoch": 0.8928034629952504, + "flos": 13297722117120.0, + "grad_norm": 2.266639274637357, + "language_loss": 0.74034125, + "learning_rate": 1.192273216661206e-07, + "loss": 0.76173735, + "num_input_tokens_seen": 160557820, + "step": 7425, + "time_per_iteration": 2.517336130142212 + }, + { + "auxiliary_loss_clip": 0.01011198, + "auxiliary_loss_mlp": 0.01000785, + "balance_loss_clip": 1.00823212, + "balance_loss_mlp": 0.99978322, + "epoch": 0.8929237058858895, + "flos": 54854556744960.0, + "grad_norm": 0.774418217978881, + "language_loss": 0.57487988, + "learning_rate": 1.189625297463881e-07, + "loss": 0.59499979, + "num_input_tokens_seen": 160619510, + "step": 7426, + "time_per_iteration": 3.2216598987579346 + }, + { + "auxiliary_loss_clip": 0.01089477, + "auxiliary_loss_mlp": 0.01025038, + "balance_loss_clip": 1.03863728, + "balance_loss_mlp": 1.01832664, + "epoch": 0.8930439487765286, + "flos": 28883850785280.0, + "grad_norm": 1.5840368515953966, + "language_loss": 0.79691482, + "learning_rate": 1.1869802317854394e-07, + "loss": 0.81805998, + "num_input_tokens_seen": 160643295, + "step": 7427, + "time_per_iteration": 3.826235055923462 + }, + { + "auxiliary_loss_clip": 0.01113734, + "auxiliary_loss_mlp": 0.01023256, + "balance_loss_clip": 1.04221451, + "balance_loss_mlp": 1.0160886, + "epoch": 0.8931641916671677, + "flos": 22419283432320.0, + "grad_norm": 1.9062757888674386, + "language_loss": 0.71996498, + "learning_rate": 1.1843380200271425e-07, + "loss": 0.74133492, + "num_input_tokens_seen": 160662495, + "step": 7428, + "time_per_iteration": 3.3114731311798096 + }, + { + "auxiliary_loss_clip": 0.01115727, + "auxiliary_loss_mlp": 0.01018786, + "balance_loss_clip": 1.04142535, + "balance_loss_mlp": 1.01155257, + "epoch": 0.8932844345578068, + "flos": 25843149786240.0, + "grad_norm": 2.1047395284678694, + "language_loss": 0.80183971, + "learning_rate": 1.181698662589805e-07, + "loss": 0.82318485, + "num_input_tokens_seen": 160682080, + "step": 7429, + "time_per_iteration": 3.39693284034729 + }, + { + "auxiliary_loss_clip": 0.01150058, + "auxiliary_loss_mlp": 0.01025833, + "balance_loss_clip": 1.04470217, + "balance_loss_mlp": 1.01841831, + "epoch": 0.8934046774484459, + "flos": 22925803069440.0, + "grad_norm": 2.0712411145507152, + "language_loss": 0.7615875, + "learning_rate": 1.1790621598738249e-07, + "loss": 0.78334641, + "num_input_tokens_seen": 160700395, + "step": 7430, + "time_per_iteration": 2.5288875102996826 + }, + { + "auxiliary_loss_clip": 0.01162498, + "auxiliary_loss_mlp": 0.01023594, + "balance_loss_clip": 1.04783988, + "balance_loss_mlp": 1.01703143, + "epoch": 0.8935249203390849, + "flos": 24462097718400.0, + "grad_norm": 2.0024294129715092, + "language_loss": 0.75163984, + "learning_rate": 1.1764285122791461e-07, + "loss": 0.77350074, + "num_input_tokens_seen": 160721115, + "step": 7431, + "time_per_iteration": 2.4550232887268066 + }, + { + "auxiliary_loss_clip": 0.01148549, + "auxiliary_loss_mlp": 0.0102207, + "balance_loss_clip": 1.04208839, + "balance_loss_mlp": 1.01517642, + "epoch": 0.8936451632297241, + "flos": 15742735966080.0, + "grad_norm": 2.513241487722935, + "language_loss": 0.77320063, + "learning_rate": 1.173797720205294e-07, + "loss": 0.79490674, + "num_input_tokens_seen": 160739150, + "step": 7432, + "time_per_iteration": 2.4107601642608643 + }, + { + "auxiliary_loss_clip": 0.0115387, + "auxiliary_loss_mlp": 0.0102771, + "balance_loss_clip": 1.04704237, + "balance_loss_mlp": 1.0200119, + "epoch": 0.8937654061203631, + "flos": 35115500396160.0, + "grad_norm": 3.5361902286749323, + "language_loss": 0.71541786, + "learning_rate": 1.1711697840513602e-07, + "loss": 0.73723364, + "num_input_tokens_seen": 160758585, + "step": 7433, + "time_per_iteration": 2.547919273376465 + }, + { + "auxiliary_loss_clip": 0.01142744, + "auxiliary_loss_mlp": 0.01022179, + "balance_loss_clip": 1.04243076, + "balance_loss_mlp": 1.01492846, + "epoch": 0.8938856490110022, + "flos": 16107444708480.0, + "grad_norm": 2.8434040265186833, + "language_loss": 0.71168488, + "learning_rate": 1.1685447042160012e-07, + "loss": 0.73333406, + "num_input_tokens_seen": 160776620, + "step": 7434, + "time_per_iteration": 2.403050184249878 + }, + { + "auxiliary_loss_clip": 0.01166433, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.0472002, + "balance_loss_mlp": 1.02115083, + "epoch": 0.8940058919016414, + "flos": 20704189858560.0, + "grad_norm": 2.2754156304127213, + "language_loss": 0.71512991, + "learning_rate": 1.1659224810974367e-07, + "loss": 0.73707676, + "num_input_tokens_seen": 160796580, + "step": 7435, + "time_per_iteration": 2.403956413269043 + }, + { + "auxiliary_loss_clip": 0.01138612, + "auxiliary_loss_mlp": 0.0102581, + "balance_loss_clip": 1.047207, + "balance_loss_mlp": 1.01860118, + "epoch": 0.8941261347922804, + "flos": 25229041937280.0, + "grad_norm": 2.186319305615022, + "language_loss": 0.68190753, + "learning_rate": 1.1633031150934591e-07, + "loss": 0.70355177, + "num_input_tokens_seen": 160819610, + "step": 7436, + "time_per_iteration": 2.550081491470337 + }, + { + "auxiliary_loss_clip": 0.0115303, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.04694581, + "balance_loss_mlp": 1.02384853, + "epoch": 0.8942463776829195, + "flos": 19537236806400.0, + "grad_norm": 2.0891362424621764, + "language_loss": 0.80209744, + "learning_rate": 1.1606866066014176e-07, + "loss": 0.82394052, + "num_input_tokens_seen": 160838660, + "step": 7437, + "time_per_iteration": 3.192620277404785 + }, + { + "auxiliary_loss_clip": 0.0111977, + "auxiliary_loss_mlp": 0.01022264, + "balance_loss_clip": 1.04246688, + "balance_loss_mlp": 1.01493287, + "epoch": 0.8943666205735585, + "flos": 22301567585280.0, + "grad_norm": 2.570170749072981, + "language_loss": 0.75530273, + "learning_rate": 1.1580729560182434e-07, + "loss": 0.77672309, + "num_input_tokens_seen": 160854515, + "step": 7438, + "time_per_iteration": 2.481064558029175 + }, + { + "auxiliary_loss_clip": 0.01164953, + "auxiliary_loss_mlp": 0.00761766, + "balance_loss_clip": 1.0469346, + "balance_loss_mlp": 1.0004034, + "epoch": 0.8944868634641977, + "flos": 18912893581440.0, + "grad_norm": 2.0853067501221054, + "language_loss": 0.70763063, + "learning_rate": 1.1554621637404171e-07, + "loss": 0.72689784, + "num_input_tokens_seen": 160872605, + "step": 7439, + "time_per_iteration": 2.3844475746154785 + }, + { + "auxiliary_loss_clip": 0.0115437, + "auxiliary_loss_mlp": 0.01019545, + "balance_loss_clip": 1.04668391, + "balance_loss_mlp": 1.01252007, + "epoch": 0.8946071063548368, + "flos": 14460904241280.0, + "grad_norm": 3.424561241713183, + "language_loss": 0.61309856, + "learning_rate": 1.1528542301639999e-07, + "loss": 0.63483769, + "num_input_tokens_seen": 160889395, + "step": 7440, + "time_per_iteration": 2.398780584335327 + }, + { + "auxiliary_loss_clip": 0.01124041, + "auxiliary_loss_mlp": 0.01020349, + "balance_loss_clip": 1.04062712, + "balance_loss_mlp": 1.01352429, + "epoch": 0.8947273492454758, + "flos": 20084084438400.0, + "grad_norm": 2.939457369216438, + "language_loss": 0.82780707, + "learning_rate": 1.1502491556846105e-07, + "loss": 0.84925097, + "num_input_tokens_seen": 160907890, + "step": 7441, + "time_per_iteration": 2.5046281814575195 + }, + { + "auxiliary_loss_clip": 0.01136078, + "auxiliary_loss_mlp": 0.01023901, + "balance_loss_clip": 1.0440073, + "balance_loss_mlp": 1.01705587, + "epoch": 0.894847592136115, + "flos": 18550555136640.0, + "grad_norm": 2.7401060281557363, + "language_loss": 0.81608999, + "learning_rate": 1.1476469406974331e-07, + "loss": 0.83768976, + "num_input_tokens_seen": 160923490, + "step": 7442, + "time_per_iteration": 2.4371020793914795 + }, + { + "auxiliary_loss_clip": 0.01163353, + "auxiliary_loss_mlp": 0.01025048, + "balance_loss_clip": 1.04734921, + "balance_loss_mlp": 1.01837802, + "epoch": 0.894967835026754, + "flos": 23478468704640.0, + "grad_norm": 1.7182839172039632, + "language_loss": 0.77063191, + "learning_rate": 1.1450475855972341e-07, + "loss": 0.79251593, + "num_input_tokens_seen": 160944280, + "step": 7443, + "time_per_iteration": 2.4304957389831543 + }, + { + "auxiliary_loss_clip": 0.01135476, + "auxiliary_loss_mlp": 0.00762009, + "balance_loss_clip": 1.04205918, + "balance_loss_mlp": 1.00045168, + "epoch": 0.8950880779173931, + "flos": 15188310564480.0, + "grad_norm": 3.074678388272662, + "language_loss": 0.71294785, + "learning_rate": 1.1424510907783158e-07, + "loss": 0.73192275, + "num_input_tokens_seen": 160961560, + "step": 7444, + "time_per_iteration": 2.4330248832702637 + }, + { + "auxiliary_loss_clip": 0.01140329, + "auxiliary_loss_mlp": 0.01026176, + "balance_loss_clip": 1.04253697, + "balance_loss_mlp": 1.01941133, + "epoch": 0.8952083208080323, + "flos": 22091957769600.0, + "grad_norm": 1.5495937147365109, + "language_loss": 0.82725537, + "learning_rate": 1.1398574566345787e-07, + "loss": 0.84892035, + "num_input_tokens_seen": 160982195, + "step": 7445, + "time_per_iteration": 2.4831669330596924 + }, + { + "auxiliary_loss_clip": 0.01141352, + "auxiliary_loss_mlp": 0.01022911, + "balance_loss_clip": 1.04245603, + "balance_loss_mlp": 1.0154959, + "epoch": 0.8953285636986713, + "flos": 23254026572160.0, + "grad_norm": 2.426159385548367, + "language_loss": 0.82391077, + "learning_rate": 1.1372666835594702e-07, + "loss": 0.8455534, + "num_input_tokens_seen": 161000520, + "step": 7446, + "time_per_iteration": 2.4716262817382812 + }, + { + "auxiliary_loss_clip": 0.0113505, + "auxiliary_loss_mlp": 0.0101974, + "balance_loss_clip": 1.04381502, + "balance_loss_mlp": 1.01338887, + "epoch": 0.8954488065893104, + "flos": 16362661818240.0, + "grad_norm": 2.000137099003427, + "language_loss": 0.71833777, + "learning_rate": 1.1346787719460071e-07, + "loss": 0.73988569, + "num_input_tokens_seen": 161019405, + "step": 7447, + "time_per_iteration": 2.4457385540008545 + }, + { + "auxiliary_loss_clip": 0.01136824, + "auxiliary_loss_mlp": 0.01025025, + "balance_loss_clip": 1.04591799, + "balance_loss_mlp": 1.01820016, + "epoch": 0.8955690494799495, + "flos": 18257883120000.0, + "grad_norm": 1.7287385889543345, + "language_loss": 0.71899825, + "learning_rate": 1.1320937221867732e-07, + "loss": 0.7406168, + "num_input_tokens_seen": 161036985, + "step": 7448, + "time_per_iteration": 2.464522361755371 + }, + { + "auxiliary_loss_clip": 0.01134856, + "auxiliary_loss_mlp": 0.01024953, + "balance_loss_clip": 1.04284477, + "balance_loss_mlp": 1.01863527, + "epoch": 0.8956892923705886, + "flos": 25447486498560.0, + "grad_norm": 1.7557887997380401, + "language_loss": 0.79429418, + "learning_rate": 1.1295115346739192e-07, + "loss": 0.81589228, + "num_input_tokens_seen": 161056985, + "step": 7449, + "time_per_iteration": 2.5046324729919434 + }, + { + "auxiliary_loss_clip": 0.01140015, + "auxiliary_loss_mlp": 0.01025745, + "balance_loss_clip": 1.04442084, + "balance_loss_mlp": 1.01857412, + "epoch": 0.8958095352612276, + "flos": 52661883939840.0, + "grad_norm": 5.6498850965287355, + "language_loss": 0.72896147, + "learning_rate": 1.1269322097991629e-07, + "loss": 0.75061899, + "num_input_tokens_seen": 161080270, + "step": 7450, + "time_per_iteration": 2.739476203918457 + }, + { + "auxiliary_loss_clip": 0.0115662, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.0491457, + "balance_loss_mlp": 1.0177176, + "epoch": 0.8959297781518668, + "flos": 23186335392000.0, + "grad_norm": 2.5594880084113467, + "language_loss": 0.67722237, + "learning_rate": 1.1243557479537846e-07, + "loss": 0.69904232, + "num_input_tokens_seen": 161100160, + "step": 7451, + "time_per_iteration": 2.4521005153656006 + }, + { + "auxiliary_loss_clip": 0.01162095, + "auxiliary_loss_mlp": 0.01019687, + "balance_loss_clip": 1.04424345, + "balance_loss_mlp": 1.01251972, + "epoch": 0.8960500210425059, + "flos": 20334309557760.0, + "grad_norm": 2.1946308636044383, + "language_loss": 0.68573368, + "learning_rate": 1.121782149528634e-07, + "loss": 0.70755148, + "num_input_tokens_seen": 161117260, + "step": 7452, + "time_per_iteration": 2.383960485458374 + }, + { + "auxiliary_loss_clip": 0.01142041, + "auxiliary_loss_mlp": 0.01020068, + "balance_loss_clip": 1.04624486, + "balance_loss_mlp": 1.01320195, + "epoch": 0.8961702639331449, + "flos": 19901694153600.0, + "grad_norm": 2.1996692613356705, + "language_loss": 0.78960574, + "learning_rate": 1.1192114149141208e-07, + "loss": 0.81122684, + "num_input_tokens_seen": 161136895, + "step": 7453, + "time_per_iteration": 2.463416576385498 + }, + { + "auxiliary_loss_clip": 0.01140509, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.04319167, + "balance_loss_mlp": 1.01975524, + "epoch": 0.8962905068237841, + "flos": 12896348567040.0, + "grad_norm": 3.0133785671726088, + "language_loss": 0.6505838, + "learning_rate": 1.1166435445002197e-07, + "loss": 0.67226481, + "num_input_tokens_seen": 161154565, + "step": 7454, + "time_per_iteration": 3.442310333251953 + }, + { + "auxiliary_loss_clip": 0.0115446, + "auxiliary_loss_mlp": 0.01026608, + "balance_loss_clip": 1.04701924, + "balance_loss_mlp": 1.01890111, + "epoch": 0.8964107497144231, + "flos": 23440331439360.0, + "grad_norm": 2.124418998365666, + "language_loss": 0.68864495, + "learning_rate": 1.1140785386764818e-07, + "loss": 0.71045566, + "num_input_tokens_seen": 161173265, + "step": 7455, + "time_per_iteration": 3.191255807876587 + }, + { + "auxiliary_loss_clip": 0.0114625, + "auxiliary_loss_mlp": 0.01027711, + "balance_loss_clip": 1.04445922, + "balance_loss_mlp": 1.02039468, + "epoch": 0.8965309926050622, + "flos": 19500176949120.0, + "grad_norm": 2.1232624020422164, + "language_loss": 0.69580305, + "learning_rate": 1.1115163978320153e-07, + "loss": 0.71754265, + "num_input_tokens_seen": 161191995, + "step": 7456, + "time_per_iteration": 3.2421813011169434 + }, + { + "auxiliary_loss_clip": 0.01156426, + "auxiliary_loss_mlp": 0.00762208, + "balance_loss_clip": 1.04639804, + "balance_loss_mlp": 1.00046539, + "epoch": 0.8966512354957014, + "flos": 28658008022400.0, + "grad_norm": 1.8422661718315823, + "language_loss": 0.82507384, + "learning_rate": 1.1089571223554917e-07, + "loss": 0.84426022, + "num_input_tokens_seen": 161212880, + "step": 7457, + "time_per_iteration": 2.504429340362549 + }, + { + "auxiliary_loss_clip": 0.01151697, + "auxiliary_loss_mlp": 0.01024351, + "balance_loss_clip": 1.04404759, + "balance_loss_mlp": 1.01708531, + "epoch": 0.8967714783863404, + "flos": 23370916406400.0, + "grad_norm": 2.0165449609390267, + "language_loss": 0.85309386, + "learning_rate": 1.1064007126351537e-07, + "loss": 0.87485433, + "num_input_tokens_seen": 161233595, + "step": 7458, + "time_per_iteration": 2.4564974308013916 + }, + { + "auxiliary_loss_clip": 0.01131771, + "auxiliary_loss_mlp": 0.01021595, + "balance_loss_clip": 1.04406762, + "balance_loss_mlp": 1.01449597, + "epoch": 0.8968917212769795, + "flos": 24535175938560.0, + "grad_norm": 2.126901697992628, + "language_loss": 0.76233041, + "learning_rate": 1.1038471690588003e-07, + "loss": 0.78386408, + "num_input_tokens_seen": 161252740, + "step": 7459, + "time_per_iteration": 2.479123592376709 + }, + { + "auxiliary_loss_clip": 0.01112005, + "auxiliary_loss_mlp": 0.01025745, + "balance_loss_clip": 1.04642045, + "balance_loss_mlp": 1.01862216, + "epoch": 0.8970119641676186, + "flos": 23475416048640.0, + "grad_norm": 2.3379079766910373, + "language_loss": 0.79907155, + "learning_rate": 1.1012964920138145e-07, + "loss": 0.82044905, + "num_input_tokens_seen": 161272325, + "step": 7460, + "time_per_iteration": 2.550973892211914 + }, + { + "auxiliary_loss_clip": 0.01130086, + "auxiliary_loss_mlp": 0.01024242, + "balance_loss_clip": 1.03987694, + "balance_loss_mlp": 1.01748919, + "epoch": 0.8971322070582577, + "flos": 24538192680960.0, + "grad_norm": 1.8724472127650784, + "language_loss": 0.75860697, + "learning_rate": 1.0987486818871205e-07, + "loss": 0.78015023, + "num_input_tokens_seen": 161295915, + "step": 7461, + "time_per_iteration": 2.5437662601470947 + }, + { + "auxiliary_loss_clip": 0.01151035, + "auxiliary_loss_mlp": 0.00761777, + "balance_loss_clip": 1.04521227, + "balance_loss_mlp": 1.00043297, + "epoch": 0.8972524499488967, + "flos": 21797454159360.0, + "grad_norm": 2.5814274669859403, + "language_loss": 0.73589694, + "learning_rate": 1.0962037390652245e-07, + "loss": 0.75502509, + "num_input_tokens_seen": 161314935, + "step": 7462, + "time_per_iteration": 2.455146551132202 + }, + { + "auxiliary_loss_clip": 0.01138681, + "auxiliary_loss_mlp": 0.01026467, + "balance_loss_clip": 1.04517114, + "balance_loss_mlp": 1.01901698, + "epoch": 0.8973726928395359, + "flos": 21726243446400.0, + "grad_norm": 1.785756433750136, + "language_loss": 0.71740371, + "learning_rate": 1.0936616639341911e-07, + "loss": 0.73905522, + "num_input_tokens_seen": 161335225, + "step": 7463, + "time_per_iteration": 2.479999303817749 + }, + { + "auxiliary_loss_clip": 0.01046846, + "auxiliary_loss_mlp": 0.01002208, + "balance_loss_clip": 1.0100919, + "balance_loss_mlp": 1.00145423, + "epoch": 0.897492935730175, + "flos": 53837100097920.0, + "grad_norm": 0.7421691002946087, + "language_loss": 0.54748058, + "learning_rate": 1.0911224568796473e-07, + "loss": 0.56797123, + "num_input_tokens_seen": 161393420, + "step": 7464, + "time_per_iteration": 3.8243470191955566 + }, + { + "auxiliary_loss_clip": 0.01149507, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.04633892, + "balance_loss_mlp": 1.02052569, + "epoch": 0.897613178620814, + "flos": 18290346036480.0, + "grad_norm": 3.4304111658655776, + "language_loss": 0.70918477, + "learning_rate": 1.0885861182867984e-07, + "loss": 0.73095441, + "num_input_tokens_seen": 161411525, + "step": 7465, + "time_per_iteration": 2.428588628768921 + }, + { + "auxiliary_loss_clip": 0.01140001, + "auxiliary_loss_mlp": 0.01022244, + "balance_loss_clip": 1.04350233, + "balance_loss_mlp": 1.01524067, + "epoch": 0.8977334215114532, + "flos": 32993718059520.0, + "grad_norm": 2.1091594877299604, + "language_loss": 0.70983779, + "learning_rate": 1.0860526485403942e-07, + "loss": 0.73146021, + "num_input_tokens_seen": 161432800, + "step": 7466, + "time_per_iteration": 2.5627036094665527 + }, + { + "auxiliary_loss_clip": 0.01164526, + "auxiliary_loss_mlp": 0.01024297, + "balance_loss_clip": 1.0464983, + "balance_loss_mlp": 1.01779079, + "epoch": 0.8978536644020922, + "flos": 15195636938880.0, + "grad_norm": 1.668742687822235, + "language_loss": 0.77024531, + "learning_rate": 1.0835220480247675e-07, + "loss": 0.79213357, + "num_input_tokens_seen": 161451295, + "step": 7467, + "time_per_iteration": 2.377856492996216 + }, + { + "auxiliary_loss_clip": 0.0113386, + "auxiliary_loss_mlp": 0.01024681, + "balance_loss_clip": 1.04409337, + "balance_loss_mlp": 1.0176661, + "epoch": 0.8979739072927313, + "flos": 18004389863040.0, + "grad_norm": 2.1487084897035733, + "language_loss": 0.8361212, + "learning_rate": 1.0809943171238067e-07, + "loss": 0.85770667, + "num_input_tokens_seen": 161469220, + "step": 7468, + "time_per_iteration": 2.4430463314056396 + }, + { + "auxiliary_loss_clip": 0.01144633, + "auxiliary_loss_mlp": 0.01029538, + "balance_loss_clip": 1.04544282, + "balance_loss_mlp": 1.02128887, + "epoch": 0.8980941501833704, + "flos": 22271546793600.0, + "grad_norm": 2.3138926097217705, + "language_loss": 0.62957704, + "learning_rate": 1.078469456220965e-07, + "loss": 0.65131873, + "num_input_tokens_seen": 161489375, + "step": 7469, + "time_per_iteration": 2.4690537452697754 + }, + { + "auxiliary_loss_clip": 0.01150439, + "auxiliary_loss_mlp": 0.01024784, + "balance_loss_clip": 1.04356158, + "balance_loss_mlp": 1.01764941, + "epoch": 0.8982143930740095, + "flos": 37560729726720.0, + "grad_norm": 2.2147316442639955, + "language_loss": 0.69598716, + "learning_rate": 1.0759474656992606e-07, + "loss": 0.71773946, + "num_input_tokens_seen": 161512145, + "step": 7470, + "time_per_iteration": 2.5764031410217285 + }, + { + "auxiliary_loss_clip": 0.01140976, + "auxiliary_loss_mlp": 0.0102563, + "balance_loss_clip": 1.04351974, + "balance_loss_mlp": 1.01827812, + "epoch": 0.8983346359646486, + "flos": 18076893465600.0, + "grad_norm": 2.144831081515994, + "language_loss": 0.77924454, + "learning_rate": 1.0734283459412785e-07, + "loss": 0.80091059, + "num_input_tokens_seen": 161528995, + "step": 7471, + "time_per_iteration": 2.447983741760254 + }, + { + "auxiliary_loss_clip": 0.01111957, + "auxiliary_loss_mlp": 0.01026433, + "balance_loss_clip": 1.04047716, + "balance_loss_mlp": 1.01849985, + "epoch": 0.8984548788552876, + "flos": 20558895344640.0, + "grad_norm": 1.855930994033701, + "language_loss": 0.80479497, + "learning_rate": 1.0709120973291707e-07, + "loss": 0.82617891, + "num_input_tokens_seen": 161548775, + "step": 7472, + "time_per_iteration": 2.5386130809783936 + }, + { + "auxiliary_loss_clip": 0.01166358, + "auxiliary_loss_mlp": 0.01026877, + "balance_loss_clip": 1.04730916, + "balance_loss_mlp": 1.01941466, + "epoch": 0.8985751217459268, + "flos": 17785442511360.0, + "grad_norm": 2.0362015033547927, + "language_loss": 0.78058845, + "learning_rate": 1.0683987202446475e-07, + "loss": 0.80252075, + "num_input_tokens_seen": 161566960, + "step": 7473, + "time_per_iteration": 2.4053902626037598 + }, + { + "auxiliary_loss_clip": 0.01153978, + "auxiliary_loss_mlp": 0.01022581, + "balance_loss_clip": 1.04515266, + "balance_loss_mlp": 1.01513028, + "epoch": 0.8986953646365659, + "flos": 21617003208960.0, + "grad_norm": 1.873774541766139, + "language_loss": 0.69974357, + "learning_rate": 1.0658882150689862e-07, + "loss": 0.72150916, + "num_input_tokens_seen": 161585820, + "step": 7474, + "time_per_iteration": 2.445111036300659 + }, + { + "auxiliary_loss_clip": 0.01127501, + "auxiliary_loss_mlp": 0.01021578, + "balance_loss_clip": 1.04371154, + "balance_loss_mlp": 1.01436627, + "epoch": 0.8988156075272049, + "flos": 14027355083520.0, + "grad_norm": 2.9974118179394447, + "language_loss": 0.78034985, + "learning_rate": 1.0633805821830288e-07, + "loss": 0.80184066, + "num_input_tokens_seen": 161602505, + "step": 7475, + "time_per_iteration": 2.4873292446136475 + }, + { + "auxiliary_loss_clip": 0.01139418, + "auxiliary_loss_mlp": 0.01025284, + "balance_loss_clip": 1.04479635, + "balance_loss_mlp": 1.01764822, + "epoch": 0.8989358504178441, + "flos": 29059202004480.0, + "grad_norm": 6.770366517517771, + "language_loss": 0.82814127, + "learning_rate": 1.0608758219671753e-07, + "loss": 0.84978831, + "num_input_tokens_seen": 161621545, + "step": 7476, + "time_per_iteration": 2.529716968536377 + }, + { + "auxiliary_loss_clip": 0.01141771, + "auxiliary_loss_mlp": 0.01021794, + "balance_loss_clip": 1.04437053, + "balance_loss_mlp": 1.01490045, + "epoch": 0.8990560933084831, + "flos": 20230420446720.0, + "grad_norm": 1.6617175867881215, + "language_loss": 0.70759833, + "learning_rate": 1.0583739348014065e-07, + "loss": 0.72923398, + "num_input_tokens_seen": 161642630, + "step": 7477, + "time_per_iteration": 2.4870245456695557 + }, + { + "auxiliary_loss_clip": 0.01168127, + "auxiliary_loss_mlp": 0.01021755, + "balance_loss_clip": 1.05057144, + "balance_loss_mlp": 1.01488304, + "epoch": 0.8991763361991222, + "flos": 25520672459520.0, + "grad_norm": 2.1802925975292387, + "language_loss": 0.8453145, + "learning_rate": 1.0558749210652518e-07, + "loss": 0.86721325, + "num_input_tokens_seen": 161662560, + "step": 7478, + "time_per_iteration": 2.4409868717193604 + }, + { + "auxiliary_loss_clip": 0.01128589, + "auxiliary_loss_mlp": 0.01024721, + "balance_loss_clip": 1.04435384, + "balance_loss_mlp": 1.01808703, + "epoch": 0.8992965790897613, + "flos": 25119191168640.0, + "grad_norm": 1.8246832863465052, + "language_loss": 0.85857689, + "learning_rate": 1.053378781137808e-07, + "loss": 0.88011003, + "num_input_tokens_seen": 161683480, + "step": 7479, + "time_per_iteration": 2.546184778213501 + }, + { + "auxiliary_loss_clip": 0.01140802, + "auxiliary_loss_mlp": 0.01026357, + "balance_loss_clip": 1.04472256, + "balance_loss_mlp": 1.01917768, + "epoch": 0.8994168219804004, + "flos": 16070815814400.0, + "grad_norm": 1.9466141023063341, + "language_loss": 0.77657908, + "learning_rate": 1.0508855153977392e-07, + "loss": 0.79825068, + "num_input_tokens_seen": 161699945, + "step": 7480, + "time_per_iteration": 2.4394664764404297 + }, + { + "auxiliary_loss_clip": 0.01152279, + "auxiliary_loss_mlp": 0.01026968, + "balance_loss_clip": 1.04386854, + "balance_loss_mlp": 1.01962423, + "epoch": 0.8995370648710395, + "flos": 24825764966400.0, + "grad_norm": 2.3094500020391857, + "language_loss": 0.67153329, + "learning_rate": 1.0483951242232669e-07, + "loss": 0.69332576, + "num_input_tokens_seen": 161720420, + "step": 7481, + "time_per_iteration": 3.387246608734131 + }, + { + "auxiliary_loss_clip": 0.01061128, + "auxiliary_loss_mlp": 0.01000908, + "balance_loss_clip": 1.00747323, + "balance_loss_mlp": 1.00005579, + "epoch": 0.8996573077616786, + "flos": 63116238378240.0, + "grad_norm": 0.9730907483693916, + "language_loss": 0.57671976, + "learning_rate": 1.0459076079921936e-07, + "loss": 0.59734011, + "num_input_tokens_seen": 161773080, + "step": 7482, + "time_per_iteration": 3.7595181465148926 + }, + { + "auxiliary_loss_clip": 0.01132041, + "auxiliary_loss_mlp": 0.0102983, + "balance_loss_clip": 1.04411364, + "balance_loss_mlp": 1.02226901, + "epoch": 0.8997775506523177, + "flos": 18219674027520.0, + "grad_norm": 2.167037524681334, + "language_loss": 0.85043657, + "learning_rate": 1.0434229670818618e-07, + "loss": 0.87205529, + "num_input_tokens_seen": 161789755, + "step": 7483, + "time_per_iteration": 3.267090320587158 + }, + { + "auxiliary_loss_clip": 0.01129703, + "auxiliary_loss_mlp": 0.01022938, + "balance_loss_clip": 1.04197001, + "balance_loss_mlp": 1.01560652, + "epoch": 0.8998977935429567, + "flos": 24166768095360.0, + "grad_norm": 1.8458140546701203, + "language_loss": 0.79887593, + "learning_rate": 1.0409412018691944e-07, + "loss": 0.82040238, + "num_input_tokens_seen": 161810220, + "step": 7484, + "time_per_iteration": 2.478732109069824 + }, + { + "auxiliary_loss_clip": 0.01134222, + "auxiliary_loss_mlp": 0.010253, + "balance_loss_clip": 1.04471016, + "balance_loss_mlp": 1.01820457, + "epoch": 0.9000180364335959, + "flos": 20773030273920.0, + "grad_norm": 2.04131354953962, + "language_loss": 0.74898016, + "learning_rate": 1.0384623127306724e-07, + "loss": 0.77057534, + "num_input_tokens_seen": 161827565, + "step": 7485, + "time_per_iteration": 2.4748785495758057 + }, + { + "auxiliary_loss_clip": 0.01120198, + "auxiliary_loss_mlp": 0.01019777, + "balance_loss_clip": 1.04142404, + "balance_loss_mlp": 1.01309812, + "epoch": 0.900138279324235, + "flos": 19205745166080.0, + "grad_norm": 1.7764018029115372, + "language_loss": 0.79428732, + "learning_rate": 1.0359863000423397e-07, + "loss": 0.81568706, + "num_input_tokens_seen": 161845700, + "step": 7486, + "time_per_iteration": 2.4852969646453857 + }, + { + "auxiliary_loss_clip": 0.01166592, + "auxiliary_loss_mlp": 0.01024787, + "balance_loss_clip": 1.04826248, + "balance_loss_mlp": 1.01786065, + "epoch": 0.900258522214874, + "flos": 28731158069760.0, + "grad_norm": 1.7360435892239923, + "language_loss": 0.71823859, + "learning_rate": 1.0335131641798112e-07, + "loss": 0.74015236, + "num_input_tokens_seen": 161867660, + "step": 7487, + "time_per_iteration": 2.4582371711730957 + }, + { + "auxiliary_loss_clip": 0.01041264, + "auxiliary_loss_mlp": 0.01001337, + "balance_loss_clip": 1.00787663, + "balance_loss_mlp": 1.00045443, + "epoch": 0.9003787651055132, + "flos": 58280685655680.0, + "grad_norm": 0.9647917465211733, + "language_loss": 0.55606437, + "learning_rate": 1.0310429055182512e-07, + "loss": 0.57649028, + "num_input_tokens_seen": 161921980, + "step": 7488, + "time_per_iteration": 2.888615131378174 + }, + { + "auxiliary_loss_clip": 0.01123851, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.04195583, + "balance_loss_mlp": 1.02096105, + "epoch": 0.9004990079961522, + "flos": 25556475340800.0, + "grad_norm": 1.6885474998831453, + "language_loss": 0.74030489, + "learning_rate": 1.0285755244324024e-07, + "loss": 0.76182562, + "num_input_tokens_seen": 161942725, + "step": 7489, + "time_per_iteration": 2.5354793071746826 + }, + { + "auxiliary_loss_clip": 0.01139014, + "auxiliary_loss_mlp": 0.00761441, + "balance_loss_clip": 1.04192805, + "balance_loss_mlp": 1.0004847, + "epoch": 0.9006192508867913, + "flos": 23335185352320.0, + "grad_norm": 2.9019924795602194, + "language_loss": 0.68757826, + "learning_rate": 1.0261110212965629e-07, + "loss": 0.70658278, + "num_input_tokens_seen": 161964520, + "step": 7490, + "time_per_iteration": 3.296309471130371 + }, + { + "auxiliary_loss_clip": 0.01137709, + "auxiliary_loss_mlp": 0.01025712, + "balance_loss_clip": 1.04443598, + "balance_loss_mlp": 1.01904547, + "epoch": 0.9007394937774305, + "flos": 18040300485120.0, + "grad_norm": 1.889246220983641, + "language_loss": 0.7922442, + "learning_rate": 1.023649396484596e-07, + "loss": 0.81387842, + "num_input_tokens_seen": 161983575, + "step": 7491, + "time_per_iteration": 2.449601173400879 + }, + { + "auxiliary_loss_clip": 0.01164072, + "auxiliary_loss_mlp": 0.01024508, + "balance_loss_clip": 1.04584169, + "balance_loss_mlp": 1.01798701, + "epoch": 0.9008597366680695, + "flos": 43068456633600.0, + "grad_norm": 1.8900207930832382, + "language_loss": 0.67653573, + "learning_rate": 1.0211906503699275e-07, + "loss": 0.69842148, + "num_input_tokens_seen": 162006550, + "step": 7492, + "time_per_iteration": 2.5934319496154785 + }, + { + "auxiliary_loss_clip": 0.01154813, + "auxiliary_loss_mlp": 0.01027674, + "balance_loss_clip": 1.04832292, + "balance_loss_mlp": 1.01970506, + "epoch": 0.9009799795587086, + "flos": 14939055112320.0, + "grad_norm": 2.452430364775128, + "language_loss": 0.82468253, + "learning_rate": 1.0187347833255455e-07, + "loss": 0.84650743, + "num_input_tokens_seen": 162022455, + "step": 7493, + "time_per_iteration": 2.398282527923584 + }, + { + "auxiliary_loss_clip": 0.01162646, + "auxiliary_loss_mlp": 0.01024284, + "balance_loss_clip": 1.04701686, + "balance_loss_mlp": 1.0173043, + "epoch": 0.9011002224493477, + "flos": 21579584215680.0, + "grad_norm": 1.635963031346961, + "language_loss": 0.79221272, + "learning_rate": 1.0162817957240056e-07, + "loss": 0.81408203, + "num_input_tokens_seen": 162042350, + "step": 7494, + "time_per_iteration": 2.407040596008301 + }, + { + "auxiliary_loss_clip": 0.0105174, + "auxiliary_loss_mlp": 0.01001423, + "balance_loss_clip": 1.00783718, + "balance_loss_mlp": 1.0005703, + "epoch": 0.9012204653399868, + "flos": 71166367883520.0, + "grad_norm": 0.8835582868496925, + "language_loss": 0.63025457, + "learning_rate": 1.0138316879374253e-07, + "loss": 0.65078616, + "num_input_tokens_seen": 162111640, + "step": 7495, + "time_per_iteration": 3.1716079711914062 + }, + { + "auxiliary_loss_clip": 0.01141133, + "auxiliary_loss_mlp": 0.01021399, + "balance_loss_clip": 1.04807615, + "balance_loss_mlp": 1.01397514, + "epoch": 0.9013407082306258, + "flos": 15594963413760.0, + "grad_norm": 3.6988235192551513, + "language_loss": 0.74481285, + "learning_rate": 1.0113844603374833e-07, + "loss": 0.76643813, + "num_input_tokens_seen": 162128165, + "step": 7496, + "time_per_iteration": 2.4523158073425293 + }, + { + "auxiliary_loss_clip": 0.01136963, + "auxiliary_loss_mlp": 0.01024187, + "balance_loss_clip": 1.04221725, + "balance_loss_mlp": 1.01631284, + "epoch": 0.901460951121265, + "flos": 15049157276160.0, + "grad_norm": 3.226879365709535, + "language_loss": 0.72016931, + "learning_rate": 1.0089401132954178e-07, + "loss": 0.74178082, + "num_input_tokens_seen": 162146145, + "step": 7497, + "time_per_iteration": 2.4674620628356934 + }, + { + "auxiliary_loss_clip": 0.01139322, + "auxiliary_loss_mlp": 0.0102361, + "balance_loss_clip": 1.04646587, + "balance_loss_mlp": 1.01690114, + "epoch": 0.9015811940119041, + "flos": 22236857233920.0, + "grad_norm": 1.580516604419315, + "language_loss": 0.72286189, + "learning_rate": 1.006498647182037e-07, + "loss": 0.74449122, + "num_input_tokens_seen": 162164800, + "step": 7498, + "time_per_iteration": 2.483145236968994 + }, + { + "auxiliary_loss_clip": 0.01092436, + "auxiliary_loss_mlp": 0.01027796, + "balance_loss_clip": 1.03913879, + "balance_loss_mlp": 1.0205512, + "epoch": 0.9017014369025431, + "flos": 24973824827520.0, + "grad_norm": 2.358434843356064, + "language_loss": 0.7148248, + "learning_rate": 1.004060062367713e-07, + "loss": 0.73602712, + "num_input_tokens_seen": 162185895, + "step": 7499, + "time_per_iteration": 2.6162989139556885 + }, + { + "auxiliary_loss_clip": 0.0115358, + "auxiliary_loss_mlp": 0.01023289, + "balance_loss_clip": 1.04602504, + "balance_loss_mlp": 1.01587462, + "epoch": 0.9018216797931822, + "flos": 18114168804480.0, + "grad_norm": 1.739161401098802, + "language_loss": 0.69203043, + "learning_rate": 1.0016243592223728e-07, + "loss": 0.71379912, + "num_input_tokens_seen": 162206295, + "step": 7500, + "time_per_iteration": 2.4434964656829834 + }, + { + "auxiliary_loss_clip": 0.01094081, + "auxiliary_loss_mlp": 0.01024004, + "balance_loss_clip": 1.04049206, + "balance_loss_mlp": 1.01689661, + "epoch": 0.9019419226838213, + "flos": 37268452759680.0, + "grad_norm": 1.856609138793378, + "language_loss": 0.65798825, + "learning_rate": 9.991915381155114e-08, + "loss": 0.67916918, + "num_input_tokens_seen": 162229275, + "step": 7501, + "time_per_iteration": 2.699352502822876 + }, + { + "auxiliary_loss_clip": 0.01154714, + "auxiliary_loss_mlp": 0.01023093, + "balance_loss_clip": 1.04588509, + "balance_loss_mlp": 1.01587462, + "epoch": 0.9020621655744604, + "flos": 23441121538560.0, + "grad_norm": 2.0112876345868647, + "language_loss": 0.74872202, + "learning_rate": 9.967615994161871e-08, + "loss": 0.77050006, + "num_input_tokens_seen": 162248935, + "step": 7502, + "time_per_iteration": 2.4711592197418213 + }, + { + "auxiliary_loss_clip": 0.0116393, + "auxiliary_loss_mlp": 0.01021335, + "balance_loss_clip": 1.04577315, + "balance_loss_mlp": 1.01435566, + "epoch": 0.9021824084650995, + "flos": 22857465444480.0, + "grad_norm": 1.7030176325353608, + "language_loss": 0.78332675, + "learning_rate": 9.943345434930161e-08, + "loss": 0.80517936, + "num_input_tokens_seen": 162269185, + "step": 7503, + "time_per_iteration": 2.4451146125793457 + }, + { + "auxiliary_loss_clip": 0.01123429, + "auxiliary_loss_mlp": 0.01023997, + "balance_loss_clip": 1.04454303, + "balance_loss_mlp": 1.01692176, + "epoch": 0.9023026513557386, + "flos": 22127581082880.0, + "grad_norm": 1.8910205186235143, + "language_loss": 0.68888879, + "learning_rate": 9.919103707141885e-08, + "loss": 0.71036303, + "num_input_tokens_seen": 162288065, + "step": 7504, + "time_per_iteration": 2.504936456680298 + }, + { + "auxiliary_loss_clip": 0.01149092, + "auxiliary_loss_mlp": 0.01025003, + "balance_loss_clip": 1.04483533, + "balance_loss_mlp": 1.01733744, + "epoch": 0.9024228942463777, + "flos": 24199087357440.0, + "grad_norm": 1.8683829897163502, + "language_loss": 0.76707155, + "learning_rate": 9.89489081447441e-08, + "loss": 0.78881252, + "num_input_tokens_seen": 162305265, + "step": 7505, + "time_per_iteration": 2.436614751815796 + }, + { + "auxiliary_loss_clip": 0.01136553, + "auxiliary_loss_mlp": 0.01021817, + "balance_loss_clip": 1.04272437, + "balance_loss_mlp": 1.0144136, + "epoch": 0.9025431371370167, + "flos": 25008262992000.0, + "grad_norm": 1.8462170707015797, + "language_loss": 0.83278984, + "learning_rate": 9.870706760600844e-08, + "loss": 0.85437357, + "num_input_tokens_seen": 162325215, + "step": 7506, + "time_per_iteration": 2.4989330768585205 + }, + { + "auxiliary_loss_clip": 0.01118712, + "auxiliary_loss_mlp": 0.01026842, + "balance_loss_clip": 1.04765582, + "balance_loss_mlp": 1.01949906, + "epoch": 0.9026633800276559, + "flos": 18952862440320.0, + "grad_norm": 2.492504751861586, + "language_loss": 0.72674954, + "learning_rate": 9.846551549189918e-08, + "loss": 0.74820513, + "num_input_tokens_seen": 162344820, + "step": 7507, + "time_per_iteration": 3.3484420776367188 + }, + { + "auxiliary_loss_clip": 0.01135015, + "auxiliary_loss_mlp": 0.01023458, + "balance_loss_clip": 1.04397464, + "balance_loss_mlp": 1.01621294, + "epoch": 0.902783622918295, + "flos": 32416059536640.0, + "grad_norm": 1.9638969202479812, + "language_loss": 0.68712258, + "learning_rate": 9.822425183905902e-08, + "loss": 0.70870733, + "num_input_tokens_seen": 162365345, + "step": 7508, + "time_per_iteration": 2.557969808578491 + }, + { + "auxiliary_loss_clip": 0.01032407, + "auxiliary_loss_mlp": 0.01001098, + "balance_loss_clip": 1.00843966, + "balance_loss_mlp": 1.00021541, + "epoch": 0.902903865808934, + "flos": 63717453244800.0, + "grad_norm": 0.9300813243559274, + "language_loss": 0.7522254, + "learning_rate": 9.798327668408823e-08, + "loss": 0.77256048, + "num_input_tokens_seen": 162426980, + "step": 7509, + "time_per_iteration": 3.8911752700805664 + }, + { + "auxiliary_loss_clip": 0.0116887, + "auxiliary_loss_mlp": 0.01025138, + "balance_loss_clip": 1.0476203, + "balance_loss_mlp": 1.01756859, + "epoch": 0.9030241086995732, + "flos": 23804034600960.0, + "grad_norm": 2.3033024802018858, + "language_loss": 0.68901622, + "learning_rate": 9.774259006354158e-08, + "loss": 0.71095628, + "num_input_tokens_seen": 162447050, + "step": 7510, + "time_per_iteration": 3.165647268295288 + }, + { + "auxiliary_loss_clip": 0.0114119, + "auxiliary_loss_mlp": 0.01025986, + "balance_loss_clip": 1.0435791, + "balance_loss_mlp": 1.01893759, + "epoch": 0.9031443515902122, + "flos": 26395887248640.0, + "grad_norm": 1.9847206849650965, + "language_loss": 0.75995255, + "learning_rate": 9.750219201393184e-08, + "loss": 0.78162432, + "num_input_tokens_seen": 162467015, + "step": 7511, + "time_per_iteration": 2.5126726627349854 + }, + { + "auxiliary_loss_clip": 0.0114898, + "auxiliary_loss_mlp": 0.01019428, + "balance_loss_clip": 1.0442965, + "balance_loss_mlp": 1.0123018, + "epoch": 0.9032645944808513, + "flos": 24939350749440.0, + "grad_norm": 1.922756282202372, + "language_loss": 0.77448583, + "learning_rate": 9.726208257172697e-08, + "loss": 0.79616994, + "num_input_tokens_seen": 162488710, + "step": 7512, + "time_per_iteration": 2.4924213886260986 + }, + { + "auxiliary_loss_clip": 0.01164692, + "auxiliary_loss_mlp": 0.01022105, + "balance_loss_clip": 1.04710519, + "balance_loss_mlp": 1.0151639, + "epoch": 0.9033848373714904, + "flos": 21178821196800.0, + "grad_norm": 2.6589378993722392, + "language_loss": 0.74869305, + "learning_rate": 9.702226177335115e-08, + "loss": 0.77056104, + "num_input_tokens_seen": 162507205, + "step": 7513, + "time_per_iteration": 2.4249424934387207 + }, + { + "auxiliary_loss_clip": 0.01136297, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.04524136, + "balance_loss_mlp": 1.02432549, + "epoch": 0.9035050802621295, + "flos": 26286359702400.0, + "grad_norm": 1.7505175252544227, + "language_loss": 0.72428823, + "learning_rate": 9.67827296551853e-08, + "loss": 0.7459718, + "num_input_tokens_seen": 162528490, + "step": 7514, + "time_per_iteration": 2.5159690380096436 + }, + { + "auxiliary_loss_clip": 0.01129064, + "auxiliary_loss_mlp": 0.00761706, + "balance_loss_clip": 1.040609, + "balance_loss_mlp": 1.00048304, + "epoch": 0.9036253231527686, + "flos": 24204546224640.0, + "grad_norm": 2.1967319204783156, + "language_loss": 0.68626791, + "learning_rate": 9.65434862535659e-08, + "loss": 0.70517558, + "num_input_tokens_seen": 162547860, + "step": 7515, + "time_per_iteration": 2.486316204071045 + }, + { + "auxiliary_loss_clip": 0.01138782, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.04319763, + "balance_loss_mlp": 1.0222944, + "epoch": 0.9037455660434077, + "flos": 18072655660800.0, + "grad_norm": 3.026203917082258, + "language_loss": 0.64671063, + "learning_rate": 9.630453160478635e-08, + "loss": 0.66839582, + "num_input_tokens_seen": 162563215, + "step": 7516, + "time_per_iteration": 2.4332194328308105 + }, + { + "auxiliary_loss_clip": 0.01111823, + "auxiliary_loss_mlp": 0.01022925, + "balance_loss_clip": 1.04214573, + "balance_loss_mlp": 1.01568878, + "epoch": 0.9038658089340468, + "flos": 24060795995520.0, + "grad_norm": 1.6804954566547874, + "language_loss": 0.82410437, + "learning_rate": 9.60658657450959e-08, + "loss": 0.84545183, + "num_input_tokens_seen": 162583515, + "step": 7517, + "time_per_iteration": 3.349963426589966 + }, + { + "auxiliary_loss_clip": 0.01122738, + "auxiliary_loss_mlp": 0.01024807, + "balance_loss_clip": 1.04015434, + "balance_loss_mlp": 1.01776767, + "epoch": 0.9039860518246858, + "flos": 21834298535040.0, + "grad_norm": 1.5632127066299444, + "language_loss": 0.79289317, + "learning_rate": 9.582748871069979e-08, + "loss": 0.81436861, + "num_input_tokens_seen": 162602955, + "step": 7518, + "time_per_iteration": 2.5092217922210693 + }, + { + "auxiliary_loss_clip": 0.01138744, + "auxiliary_loss_mlp": 0.00761296, + "balance_loss_clip": 1.04273462, + "balance_loss_mlp": 1.00042844, + "epoch": 0.904106294715325, + "flos": 26614870513920.0, + "grad_norm": 2.177561591943742, + "language_loss": 0.82648271, + "learning_rate": 9.558940053775954e-08, + "loss": 0.84548306, + "num_input_tokens_seen": 162621595, + "step": 7519, + "time_per_iteration": 2.578040599822998 + }, + { + "auxiliary_loss_clip": 0.01149752, + "auxiliary_loss_mlp": 0.01026148, + "balance_loss_clip": 1.04571056, + "balance_loss_mlp": 1.01901889, + "epoch": 0.904226537605964, + "flos": 17785693906560.0, + "grad_norm": 6.918367209100176, + "language_loss": 0.68074358, + "learning_rate": 9.535160126239294e-08, + "loss": 0.70250249, + "num_input_tokens_seen": 162638220, + "step": 7520, + "time_per_iteration": 2.4742648601531982 + }, + { + "auxiliary_loss_clip": 0.01149248, + "auxiliary_loss_mlp": 0.01025002, + "balance_loss_clip": 1.04616082, + "balance_loss_mlp": 1.01804328, + "epoch": 0.9043467804966031, + "flos": 24790428961920.0, + "grad_norm": 1.5518936283260256, + "language_loss": 0.70606309, + "learning_rate": 9.511409092067424e-08, + "loss": 0.72780561, + "num_input_tokens_seen": 162658575, + "step": 7521, + "time_per_iteration": 2.55526065826416 + }, + { + "auxiliary_loss_clip": 0.01139797, + "auxiliary_loss_mlp": 0.01022345, + "balance_loss_clip": 1.0469507, + "balance_loss_mlp": 1.01537108, + "epoch": 0.9044670233872423, + "flos": 22632125472000.0, + "grad_norm": 1.7966911876723506, + "language_loss": 0.67536676, + "learning_rate": 9.487686954863327e-08, + "loss": 0.69698817, + "num_input_tokens_seen": 162678295, + "step": 7522, + "time_per_iteration": 2.4851834774017334 + }, + { + "auxiliary_loss_clip": 0.01149355, + "auxiliary_loss_mlp": 0.01023135, + "balance_loss_clip": 1.04549479, + "balance_loss_mlp": 1.01616693, + "epoch": 0.9045872662778813, + "flos": 23771320289280.0, + "grad_norm": 5.003024867350654, + "language_loss": 0.76886868, + "learning_rate": 9.46399371822566e-08, + "loss": 0.79059362, + "num_input_tokens_seen": 162698070, + "step": 7523, + "time_per_iteration": 2.4811840057373047 + }, + { + "auxiliary_loss_clip": 0.01165438, + "auxiliary_loss_mlp": 0.01023687, + "balance_loss_clip": 1.04738045, + "balance_loss_mlp": 1.01633167, + "epoch": 0.9047075091685204, + "flos": 15191039998080.0, + "grad_norm": 3.0669154921102857, + "language_loss": 0.72337472, + "learning_rate": 9.440329385748657e-08, + "loss": 0.74526596, + "num_input_tokens_seen": 162715140, + "step": 7524, + "time_per_iteration": 2.378437042236328 + }, + { + "auxiliary_loss_clip": 0.0112303, + "auxiliary_loss_mlp": 0.01017748, + "balance_loss_clip": 1.04415119, + "balance_loss_mlp": 1.01148367, + "epoch": 0.9048277520591596, + "flos": 18003707504640.0, + "grad_norm": 1.6884552421013848, + "language_loss": 0.70513642, + "learning_rate": 9.416693961022137e-08, + "loss": 0.72654414, + "num_input_tokens_seen": 162733390, + "step": 7525, + "time_per_iteration": 2.4754233360290527 + }, + { + "auxiliary_loss_clip": 0.01083976, + "auxiliary_loss_mlp": 0.0102422, + "balance_loss_clip": 1.03895688, + "balance_loss_mlp": 1.01720452, + "epoch": 0.9049479949497986, + "flos": 21872471713920.0, + "grad_norm": 1.6828772868579742, + "language_loss": 0.77079493, + "learning_rate": 9.393087447631654e-08, + "loss": 0.79187691, + "num_input_tokens_seen": 162751670, + "step": 7526, + "time_per_iteration": 2.560612678527832 + }, + { + "auxiliary_loss_clip": 0.01137852, + "auxiliary_loss_mlp": 0.01019729, + "balance_loss_clip": 1.04315889, + "balance_loss_mlp": 1.01331544, + "epoch": 0.9050682378404377, + "flos": 20773928113920.0, + "grad_norm": 1.8653675147424515, + "language_loss": 0.72918928, + "learning_rate": 9.36950984915823e-08, + "loss": 0.75076509, + "num_input_tokens_seen": 162770025, + "step": 7527, + "time_per_iteration": 2.4717085361480713 + }, + { + "auxiliary_loss_clip": 0.01167262, + "auxiliary_loss_mlp": 0.01025271, + "balance_loss_clip": 1.0491178, + "balance_loss_mlp": 1.01799929, + "epoch": 0.9051884807310768, + "flos": 21580015178880.0, + "grad_norm": 1.7162262224571077, + "language_loss": 0.6927138, + "learning_rate": 9.345961169178607e-08, + "loss": 0.71463913, + "num_input_tokens_seen": 162789710, + "step": 7528, + "time_per_iteration": 2.4101996421813965 + }, + { + "auxiliary_loss_clip": 0.01113461, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.04762328, + "balance_loss_mlp": 1.01943541, + "epoch": 0.9053087236217159, + "flos": 21908059113600.0, + "grad_norm": 1.4055299782988353, + "language_loss": 0.7284615, + "learning_rate": 9.322441411265081e-08, + "loss": 0.74986017, + "num_input_tokens_seen": 162810695, + "step": 7529, + "time_per_iteration": 2.531101703643799 + }, + { + "auxiliary_loss_clip": 0.01133867, + "auxiliary_loss_mlp": 0.01024464, + "balance_loss_clip": 1.04462218, + "balance_loss_mlp": 1.01745772, + "epoch": 0.9054289665123549, + "flos": 17055809544960.0, + "grad_norm": 1.9134742899752655, + "language_loss": 0.72953081, + "learning_rate": 9.298950578985554e-08, + "loss": 0.75111413, + "num_input_tokens_seen": 162827770, + "step": 7530, + "time_per_iteration": 2.532728672027588 + }, + { + "auxiliary_loss_clip": 0.01146184, + "auxiliary_loss_mlp": 0.00762121, + "balance_loss_clip": 1.04526711, + "balance_loss_mlp": 1.00035715, + "epoch": 0.905549209402994, + "flos": 20777268078720.0, + "grad_norm": 1.6917842593366994, + "language_loss": 0.70836216, + "learning_rate": 9.275488675903665e-08, + "loss": 0.72744524, + "num_input_tokens_seen": 162846715, + "step": 7531, + "time_per_iteration": 2.448601722717285 + }, + { + "auxiliary_loss_clip": 0.01109131, + "auxiliary_loss_mlp": 0.0102543, + "balance_loss_clip": 1.04351771, + "balance_loss_mlp": 1.01837349, + "epoch": 0.9056694522936332, + "flos": 21686813291520.0, + "grad_norm": 1.9192459311768664, + "language_loss": 0.73923755, + "learning_rate": 9.252055705578454e-08, + "loss": 0.76058316, + "num_input_tokens_seen": 162866215, + "step": 7532, + "time_per_iteration": 2.5558061599731445 + }, + { + "auxiliary_loss_clip": 0.01149128, + "auxiliary_loss_mlp": 0.01024256, + "balance_loss_clip": 1.04382646, + "balance_loss_mlp": 1.017717, + "epoch": 0.9057896951842722, + "flos": 29569133433600.0, + "grad_norm": 1.6143226344728039, + "language_loss": 0.71910238, + "learning_rate": 9.228651671564747e-08, + "loss": 0.74083626, + "num_input_tokens_seen": 162888245, + "step": 7533, + "time_per_iteration": 2.512439012527466 + }, + { + "auxiliary_loss_clip": 0.01104851, + "auxiliary_loss_mlp": 0.01024418, + "balance_loss_clip": 1.04348254, + "balance_loss_mlp": 1.01763821, + "epoch": 0.9059099380749113, + "flos": 27892248952320.0, + "grad_norm": 1.567067747710581, + "language_loss": 0.77892238, + "learning_rate": 9.205276577412901e-08, + "loss": 0.80021507, + "num_input_tokens_seen": 162911025, + "step": 7534, + "time_per_iteration": 3.405759572982788 + }, + { + "auxiliary_loss_clip": 0.0114276, + "auxiliary_loss_mlp": 0.00761577, + "balance_loss_clip": 1.0430876, + "balance_loss_mlp": 1.00043666, + "epoch": 0.9060301809655504, + "flos": 17748993185280.0, + "grad_norm": 2.6138948860132687, + "language_loss": 0.77097189, + "learning_rate": 9.181930426668905e-08, + "loss": 0.79001522, + "num_input_tokens_seen": 162927820, + "step": 7535, + "time_per_iteration": 2.4517784118652344 + }, + { + "auxiliary_loss_clip": 0.01108211, + "auxiliary_loss_mlp": 0.01023992, + "balance_loss_clip": 1.04355836, + "balance_loss_mlp": 1.01737297, + "epoch": 0.9061504238561895, + "flos": 31759432963200.0, + "grad_norm": 1.5695747644015576, + "language_loss": 0.67939436, + "learning_rate": 9.158613222874346e-08, + "loss": 0.70071638, + "num_input_tokens_seen": 162949445, + "step": 7536, + "time_per_iteration": 4.1521477699279785 + }, + { + "auxiliary_loss_clip": 0.01134582, + "auxiliary_loss_mlp": 0.01020546, + "balance_loss_clip": 1.04255736, + "balance_loss_mlp": 1.0136584, + "epoch": 0.9062706667468285, + "flos": 20048066075520.0, + "grad_norm": 1.625523697874035, + "language_loss": 0.81722605, + "learning_rate": 9.135324969566394e-08, + "loss": 0.8387773, + "num_input_tokens_seen": 162968945, + "step": 7537, + "time_per_iteration": 2.476557493209839 + }, + { + "auxiliary_loss_clip": 0.0115614, + "auxiliary_loss_mlp": 0.01025756, + "balance_loss_clip": 1.0471549, + "balance_loss_mlp": 1.01867819, + "epoch": 0.9063909096374677, + "flos": 18437292576000.0, + "grad_norm": 1.785844906890705, + "language_loss": 0.75357199, + "learning_rate": 9.112065670277913e-08, + "loss": 0.77539092, + "num_input_tokens_seen": 162985310, + "step": 7538, + "time_per_iteration": 2.420950174331665 + }, + { + "auxiliary_loss_clip": 0.01134283, + "auxiliary_loss_mlp": 0.01022287, + "balance_loss_clip": 1.04285014, + "balance_loss_mlp": 1.01562071, + "epoch": 0.9065111525281068, + "flos": 33547353361920.0, + "grad_norm": 2.1834346437163754, + "language_loss": 0.72660363, + "learning_rate": 9.088835328537303e-08, + "loss": 0.7481693, + "num_input_tokens_seen": 163006900, + "step": 7539, + "time_per_iteration": 2.586735248565674 + }, + { + "auxiliary_loss_clip": 0.0114094, + "auxiliary_loss_mlp": 0.01023267, + "balance_loss_clip": 1.04530478, + "balance_loss_mlp": 1.01602483, + "epoch": 0.9066313954187458, + "flos": 23367863750400.0, + "grad_norm": 11.837360681386912, + "language_loss": 0.71457005, + "learning_rate": 9.065633947868568e-08, + "loss": 0.73621213, + "num_input_tokens_seen": 163026505, + "step": 7540, + "time_per_iteration": 2.5033652782440186 + }, + { + "auxiliary_loss_clip": 0.01123452, + "auxiliary_loss_mlp": 0.00761718, + "balance_loss_clip": 1.04577088, + "balance_loss_mlp": 1.00037241, + "epoch": 0.906751638309385, + "flos": 26249623067520.0, + "grad_norm": 2.3554558908120646, + "language_loss": 0.79849142, + "learning_rate": 9.042461531791379e-08, + "loss": 0.81734312, + "num_input_tokens_seen": 163044925, + "step": 7541, + "time_per_iteration": 2.55106520652771 + }, + { + "auxiliary_loss_clip": 0.01161096, + "auxiliary_loss_mlp": 0.01022355, + "balance_loss_clip": 1.04542935, + "balance_loss_mlp": 1.01575983, + "epoch": 0.906871881200024, + "flos": 16544477485440.0, + "grad_norm": 1.7459686267517114, + "language_loss": 0.78215909, + "learning_rate": 9.019318083820903e-08, + "loss": 0.80399358, + "num_input_tokens_seen": 163063505, + "step": 7542, + "time_per_iteration": 2.4267797470092773 + }, + { + "auxiliary_loss_clip": 0.01150141, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.04700756, + "balance_loss_mlp": 1.02212477, + "epoch": 0.9069921240906631, + "flos": 24605129675520.0, + "grad_norm": 1.6561832999381751, + "language_loss": 0.85298896, + "learning_rate": 8.996203607468045e-08, + "loss": 0.87478125, + "num_input_tokens_seen": 163082505, + "step": 7543, + "time_per_iteration": 2.4794650077819824 + }, + { + "auxiliary_loss_clip": 0.01147579, + "auxiliary_loss_mlp": 0.01025188, + "balance_loss_clip": 1.04332185, + "balance_loss_mlp": 1.01804733, + "epoch": 0.9071123669813023, + "flos": 25374731500800.0, + "grad_norm": 1.503951133656035, + "language_loss": 0.75568414, + "learning_rate": 8.973118106239241e-08, + "loss": 0.77741194, + "num_input_tokens_seen": 163105110, + "step": 7544, + "time_per_iteration": 3.316143274307251 + }, + { + "auxiliary_loss_clip": 0.01092663, + "auxiliary_loss_mlp": 0.01027414, + "balance_loss_clip": 1.03725302, + "balance_loss_mlp": 1.02018106, + "epoch": 0.9072326098719413, + "flos": 26725798690560.0, + "grad_norm": 1.9916631437492198, + "language_loss": 0.94762319, + "learning_rate": 8.95006158363656e-08, + "loss": 0.96882391, + "num_input_tokens_seen": 163125295, + "step": 7545, + "time_per_iteration": 2.6222267150878906 + }, + { + "auxiliary_loss_clip": 0.01152857, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.04871011, + "balance_loss_mlp": 1.02144623, + "epoch": 0.9073528527625804, + "flos": 23878800760320.0, + "grad_norm": 1.8083816990157908, + "language_loss": 0.77342892, + "learning_rate": 8.9270340431576e-08, + "loss": 0.79524761, + "num_input_tokens_seen": 163144385, + "step": 7546, + "time_per_iteration": 2.461282253265381 + }, + { + "auxiliary_loss_clip": 0.01153047, + "auxiliary_loss_mlp": 0.01025931, + "balance_loss_clip": 1.04460895, + "balance_loss_mlp": 1.01890039, + "epoch": 0.9074730956532195, + "flos": 37852144767360.0, + "grad_norm": 2.0768832205644254, + "language_loss": 0.7363615, + "learning_rate": 8.904035488295658e-08, + "loss": 0.75815123, + "num_input_tokens_seen": 163163885, + "step": 7547, + "time_per_iteration": 2.592989683151245 + }, + { + "auxiliary_loss_clip": 0.01051382, + "auxiliary_loss_mlp": 0.00752984, + "balance_loss_clip": 1.00768936, + "balance_loss_mlp": 0.99991649, + "epoch": 0.9075933385438586, + "flos": 65173307385600.0, + "grad_norm": 0.7052276313477072, + "language_loss": 0.53276253, + "learning_rate": 8.881065922539632e-08, + "loss": 0.55080622, + "num_input_tokens_seen": 163224325, + "step": 7548, + "time_per_iteration": 2.9635889530181885 + }, + { + "auxiliary_loss_clip": 0.01115727, + "auxiliary_loss_mlp": 0.01018214, + "balance_loss_clip": 1.04232168, + "balance_loss_mlp": 1.0118897, + "epoch": 0.9077135814344977, + "flos": 19931571290880.0, + "grad_norm": 2.2110591409263756, + "language_loss": 0.73235798, + "learning_rate": 8.85812534937389e-08, + "loss": 0.7536974, + "num_input_tokens_seen": 163242425, + "step": 7549, + "time_per_iteration": 2.503549337387085 + }, + { + "auxiliary_loss_clip": 0.01157539, + "auxiliary_loss_mlp": 0.01026804, + "balance_loss_clip": 1.04628873, + "balance_loss_mlp": 1.01960421, + "epoch": 0.9078338243251368, + "flos": 17529650784000.0, + "grad_norm": 2.5991638556843792, + "language_loss": 0.67745119, + "learning_rate": 8.835213772278583e-08, + "loss": 0.69929463, + "num_input_tokens_seen": 163259280, + "step": 7550, + "time_per_iteration": 2.405254602432251 + }, + { + "auxiliary_loss_clip": 0.01114218, + "auxiliary_loss_mlp": 0.01021718, + "balance_loss_clip": 1.04352665, + "balance_loss_mlp": 1.0150156, + "epoch": 0.9079540672157759, + "flos": 28803410277120.0, + "grad_norm": 1.7372053746226181, + "language_loss": 0.78826368, + "learning_rate": 8.812331194729373e-08, + "loss": 0.809623, + "num_input_tokens_seen": 163278925, + "step": 7551, + "time_per_iteration": 2.5577621459960938 + }, + { + "auxiliary_loss_clip": 0.01172598, + "auxiliary_loss_mlp": 0.01024896, + "balance_loss_clip": 1.05264294, + "balance_loss_mlp": 1.01746345, + "epoch": 0.9080743101064149, + "flos": 23513840622720.0, + "grad_norm": 1.8450358039537986, + "language_loss": 0.72374737, + "learning_rate": 8.789477620197461e-08, + "loss": 0.74572229, + "num_input_tokens_seen": 163298450, + "step": 7552, + "time_per_iteration": 2.4289121627807617 + }, + { + "auxiliary_loss_clip": 0.0113478, + "auxiliary_loss_mlp": 0.01025533, + "balance_loss_clip": 1.04257774, + "balance_loss_mlp": 1.01837099, + "epoch": 0.9081945529970541, + "flos": 22778102344320.0, + "grad_norm": 2.3035225049834023, + "language_loss": 0.79151344, + "learning_rate": 8.766653052149831e-08, + "loss": 0.81311661, + "num_input_tokens_seen": 163313635, + "step": 7553, + "time_per_iteration": 2.4660696983337402 + }, + { + "auxiliary_loss_clip": 0.0113367, + "auxiliary_loss_mlp": 0.01026882, + "balance_loss_clip": 1.04270899, + "balance_loss_mlp": 1.01934838, + "epoch": 0.9083147958876931, + "flos": 18873714821760.0, + "grad_norm": 2.1813363136202804, + "language_loss": 0.74388784, + "learning_rate": 8.743857494048823e-08, + "loss": 0.76549333, + "num_input_tokens_seen": 163330450, + "step": 7554, + "time_per_iteration": 2.4595232009887695 + }, + { + "auxiliary_loss_clip": 0.01122732, + "auxiliary_loss_mlp": 0.01027159, + "balance_loss_clip": 1.04260564, + "balance_loss_mlp": 1.01972318, + "epoch": 0.9084350387783322, + "flos": 18909374048640.0, + "grad_norm": 2.3014308346613634, + "language_loss": 0.62604713, + "learning_rate": 8.721090949352605e-08, + "loss": 0.64754605, + "num_input_tokens_seen": 163346690, + "step": 7555, + "time_per_iteration": 2.4897663593292236 + }, + { + "auxiliary_loss_clip": 0.01160267, + "auxiliary_loss_mlp": 0.01025307, + "balance_loss_clip": 1.04845309, + "balance_loss_mlp": 1.0175879, + "epoch": 0.9085552816689714, + "flos": 20595488325120.0, + "grad_norm": 2.959404170864298, + "language_loss": 0.73229825, + "learning_rate": 8.698353421514793e-08, + "loss": 0.75415397, + "num_input_tokens_seen": 163365065, + "step": 7556, + "time_per_iteration": 2.4372453689575195 + }, + { + "auxiliary_loss_clip": 0.01152229, + "auxiliary_loss_mlp": 0.01025786, + "balance_loss_clip": 1.04782963, + "balance_loss_mlp": 1.01907396, + "epoch": 0.9086755245596104, + "flos": 18113163223680.0, + "grad_norm": 2.5466287973693644, + "language_loss": 0.80263793, + "learning_rate": 8.67564491398467e-08, + "loss": 0.82441807, + "num_input_tokens_seen": 163382070, + "step": 7557, + "time_per_iteration": 2.4207801818847656 + }, + { + "auxiliary_loss_clip": 0.01152376, + "auxiliary_loss_mlp": 0.01025431, + "balance_loss_clip": 1.04449224, + "balance_loss_mlp": 1.01765227, + "epoch": 0.9087957674502495, + "flos": 19129793857920.0, + "grad_norm": 1.829358981837066, + "language_loss": 0.74024117, + "learning_rate": 8.652965430207104e-08, + "loss": 0.76201928, + "num_input_tokens_seen": 163399975, + "step": 7558, + "time_per_iteration": 2.4300026893615723 + }, + { + "auxiliary_loss_clip": 0.01154412, + "auxiliary_loss_mlp": 0.01027573, + "balance_loss_clip": 1.04613686, + "balance_loss_mlp": 1.01985383, + "epoch": 0.9089160103408886, + "flos": 18109930999680.0, + "grad_norm": 2.7621351215237477, + "language_loss": 0.65728498, + "learning_rate": 8.630314973622521e-08, + "loss": 0.6791048, + "num_input_tokens_seen": 163417520, + "step": 7559, + "time_per_iteration": 2.4251675605773926 + }, + { + "auxiliary_loss_clip": 0.01148022, + "auxiliary_loss_mlp": 0.01023092, + "balance_loss_clip": 1.0470233, + "balance_loss_mlp": 1.01640701, + "epoch": 0.9090362532315277, + "flos": 33364855336320.0, + "grad_norm": 6.0647576622088, + "language_loss": 0.7082597, + "learning_rate": 8.607693547666995e-08, + "loss": 0.72997075, + "num_input_tokens_seen": 163440060, + "step": 7560, + "time_per_iteration": 3.3738601207733154 + }, + { + "auxiliary_loss_clip": 0.01032142, + "auxiliary_loss_mlp": 0.01002092, + "balance_loss_clip": 1.00794005, + "balance_loss_mlp": 1.00115621, + "epoch": 0.9091564961221668, + "flos": 71480585082240.0, + "grad_norm": 0.945148805249054, + "language_loss": 0.58006293, + "learning_rate": 8.585101155772201e-08, + "loss": 0.60040528, + "num_input_tokens_seen": 163502180, + "step": 7561, + "time_per_iteration": 3.174197196960449 + }, + { + "auxiliary_loss_clip": 0.01128535, + "auxiliary_loss_mlp": 0.01025505, + "balance_loss_clip": 1.03911686, + "balance_loss_mlp": 1.01817942, + "epoch": 0.9092767390128058, + "flos": 24712574232960.0, + "grad_norm": 1.871363762117446, + "language_loss": 0.68608797, + "learning_rate": 8.562537801365377e-08, + "loss": 0.70762837, + "num_input_tokens_seen": 163521915, + "step": 7562, + "time_per_iteration": 3.236593246459961 + }, + { + "auxiliary_loss_clip": 0.0116584, + "auxiliary_loss_mlp": 0.01027418, + "balance_loss_clip": 1.04670584, + "balance_loss_mlp": 1.02020562, + "epoch": 0.909396981903445, + "flos": 23586487879680.0, + "grad_norm": 1.7726976425605123, + "language_loss": 0.70005274, + "learning_rate": 8.540003487869362e-08, + "loss": 0.72198522, + "num_input_tokens_seen": 163543585, + "step": 7563, + "time_per_iteration": 3.2606747150421143 + }, + { + "auxiliary_loss_clip": 0.0111132, + "auxiliary_loss_mlp": 0.0102229, + "balance_loss_clip": 1.04052711, + "balance_loss_mlp": 1.01507771, + "epoch": 0.909517224794084, + "flos": 23404169422080.0, + "grad_norm": 1.758883586674906, + "language_loss": 0.80005574, + "learning_rate": 8.517498218702557e-08, + "loss": 0.82139182, + "num_input_tokens_seen": 163561515, + "step": 7564, + "time_per_iteration": 2.5211315155029297 + }, + { + "auxiliary_loss_clip": 0.0111875, + "auxiliary_loss_mlp": 0.01020598, + "balance_loss_clip": 1.0425452, + "balance_loss_mlp": 1.01372004, + "epoch": 0.9096374676847231, + "flos": 19208618254080.0, + "grad_norm": 1.602903414177587, + "language_loss": 0.69473499, + "learning_rate": 8.49502199727905e-08, + "loss": 0.71612847, + "num_input_tokens_seen": 163579540, + "step": 7565, + "time_per_iteration": 2.50368070602417 + }, + { + "auxiliary_loss_clip": 0.01146143, + "auxiliary_loss_mlp": 0.01025401, + "balance_loss_clip": 1.04178071, + "balance_loss_mlp": 1.01820064, + "epoch": 0.9097577105753623, + "flos": 33292495388160.0, + "grad_norm": 2.8865628899563283, + "language_loss": 0.66459376, + "learning_rate": 8.472574827008428e-08, + "loss": 0.68630922, + "num_input_tokens_seen": 163600425, + "step": 7566, + "time_per_iteration": 2.5603184700012207 + }, + { + "auxiliary_loss_clip": 0.01149132, + "auxiliary_loss_mlp": 0.01023433, + "balance_loss_clip": 1.04352951, + "balance_loss_mlp": 1.01647758, + "epoch": 0.9098779534660013, + "flos": 21906443001600.0, + "grad_norm": 1.6950286568803576, + "language_loss": 0.84148777, + "learning_rate": 8.450156711295942e-08, + "loss": 0.86321342, + "num_input_tokens_seen": 163620595, + "step": 7567, + "time_per_iteration": 2.449101448059082 + }, + { + "auxiliary_loss_clip": 0.01137527, + "auxiliary_loss_mlp": 0.01025478, + "balance_loss_clip": 1.04732513, + "balance_loss_mlp": 1.01843262, + "epoch": 0.9099981963566404, + "flos": 25730354102400.0, + "grad_norm": 1.9698055534687635, + "language_loss": 0.86512679, + "learning_rate": 8.427767653542383e-08, + "loss": 0.88675684, + "num_input_tokens_seen": 163635765, + "step": 7568, + "time_per_iteration": 2.5017755031585693 + }, + { + "auxiliary_loss_clip": 0.01102267, + "auxiliary_loss_mlp": 0.01025372, + "balance_loss_clip": 1.03823709, + "balance_loss_mlp": 1.0188688, + "epoch": 0.9101184392472795, + "flos": 21069437304960.0, + "grad_norm": 2.4978980834833613, + "language_loss": 0.70341295, + "learning_rate": 8.405407657144125e-08, + "loss": 0.72468925, + "num_input_tokens_seen": 163654925, + "step": 7569, + "time_per_iteration": 2.5462863445281982 + }, + { + "auxiliary_loss_clip": 0.01132208, + "auxiliary_loss_mlp": 0.01023143, + "balance_loss_clip": 1.04384458, + "balance_loss_mlp": 1.016011, + "epoch": 0.9102386821379186, + "flos": 24752614919040.0, + "grad_norm": 1.688640509720927, + "language_loss": 0.72256434, + "learning_rate": 8.383076725493232e-08, + "loss": 0.74411786, + "num_input_tokens_seen": 163672245, + "step": 7570, + "time_per_iteration": 3.277507781982422 + }, + { + "auxiliary_loss_clip": 0.01152124, + "auxiliary_loss_mlp": 0.01019254, + "balance_loss_clip": 1.04558885, + "balance_loss_mlp": 1.0124532, + "epoch": 0.9103589250285576, + "flos": 22562818179840.0, + "grad_norm": 1.8136145324130062, + "language_loss": 0.67824489, + "learning_rate": 8.360774861977216e-08, + "loss": 0.69995862, + "num_input_tokens_seen": 163691365, + "step": 7571, + "time_per_iteration": 2.4458084106445312 + }, + { + "auxiliary_loss_clip": 0.01135282, + "auxiliary_loss_mlp": 0.01020508, + "balance_loss_clip": 1.04011559, + "balance_loss_mlp": 1.01348901, + "epoch": 0.9104791679191968, + "flos": 25373474524800.0, + "grad_norm": 2.718233018534829, + "language_loss": 0.74644589, + "learning_rate": 8.338502069979281e-08, + "loss": 0.76800382, + "num_input_tokens_seen": 163711675, + "step": 7572, + "time_per_iteration": 2.529071807861328 + }, + { + "auxiliary_loss_clip": 0.01153382, + "auxiliary_loss_mlp": 0.01024591, + "balance_loss_clip": 1.04505289, + "balance_loss_mlp": 1.01723576, + "epoch": 0.9105994108098359, + "flos": 14426681558400.0, + "grad_norm": 4.686721507726226, + "language_loss": 0.80009401, + "learning_rate": 8.316258352878214e-08, + "loss": 0.82187378, + "num_input_tokens_seen": 163728095, + "step": 7573, + "time_per_iteration": 2.409842014312744 + }, + { + "auxiliary_loss_clip": 0.01155798, + "auxiliary_loss_mlp": 0.0102672, + "balance_loss_clip": 1.04461288, + "balance_loss_mlp": 1.01912093, + "epoch": 0.9107196537004749, + "flos": 26718292748160.0, + "grad_norm": 1.8767521077486224, + "language_loss": 0.71043646, + "learning_rate": 8.294043714048338e-08, + "loss": 0.73226166, + "num_input_tokens_seen": 163747175, + "step": 7574, + "time_per_iteration": 2.5055599212646484 + }, + { + "auxiliary_loss_clip": 0.01041189, + "auxiliary_loss_mlp": 0.01000876, + "balance_loss_clip": 1.00678039, + "balance_loss_mlp": 0.99991018, + "epoch": 0.9108398965911141, + "flos": 66532634703360.0, + "grad_norm": 0.7496199689679823, + "language_loss": 0.60471404, + "learning_rate": 8.271858156859624e-08, + "loss": 0.62513471, + "num_input_tokens_seen": 163812545, + "step": 7575, + "time_per_iteration": 3.124560594558716 + }, + { + "auxiliary_loss_clip": 0.01162867, + "auxiliary_loss_mlp": 0.01018015, + "balance_loss_clip": 1.04664505, + "balance_loss_mlp": 1.010988, + "epoch": 0.9109601394817531, + "flos": 25411073086080.0, + "grad_norm": 1.591850634327226, + "language_loss": 0.73872173, + "learning_rate": 8.249701684677557e-08, + "loss": 0.76053047, + "num_input_tokens_seen": 163833870, + "step": 7576, + "time_per_iteration": 2.4611191749572754 + }, + { + "auxiliary_loss_clip": 0.01151638, + "auxiliary_loss_mlp": 0.0102314, + "balance_loss_clip": 1.04795575, + "balance_loss_mlp": 1.01612711, + "epoch": 0.9110803823723922, + "flos": 22747794243840.0, + "grad_norm": 1.786722172825009, + "language_loss": 0.81441593, + "learning_rate": 8.227574300863294e-08, + "loss": 0.83616364, + "num_input_tokens_seen": 163854040, + "step": 7577, + "time_per_iteration": 2.471400260925293 + }, + { + "auxiliary_loss_clip": 0.01141203, + "auxiliary_loss_mlp": 0.01021569, + "balance_loss_clip": 1.04657054, + "balance_loss_mlp": 1.01424098, + "epoch": 0.9112006252630314, + "flos": 48469924131840.0, + "grad_norm": 1.819111450387425, + "language_loss": 0.69685501, + "learning_rate": 8.205476008773548e-08, + "loss": 0.71848273, + "num_input_tokens_seen": 163878040, + "step": 7578, + "time_per_iteration": 2.7352585792541504 + }, + { + "auxiliary_loss_clip": 0.01115899, + "auxiliary_loss_mlp": 0.01026756, + "balance_loss_clip": 1.04291379, + "balance_loss_mlp": 1.01962113, + "epoch": 0.9113208681536704, + "flos": 30009649829760.0, + "grad_norm": 2.2898339699230763, + "language_loss": 0.82572925, + "learning_rate": 8.183406811760596e-08, + "loss": 0.84715581, + "num_input_tokens_seen": 163897770, + "step": 7579, + "time_per_iteration": 2.6108853816986084 + }, + { + "auxiliary_loss_clip": 0.01111025, + "auxiliary_loss_mlp": 0.01019578, + "balance_loss_clip": 1.04013908, + "balance_loss_mlp": 1.01264334, + "epoch": 0.9114411110443095, + "flos": 25594971742080.0, + "grad_norm": 1.9316176250182733, + "language_loss": 0.74161077, + "learning_rate": 8.161366713172313e-08, + "loss": 0.76291674, + "num_input_tokens_seen": 163920160, + "step": 7580, + "time_per_iteration": 2.600269079208374 + }, + { + "auxiliary_loss_clip": 0.01128745, + "auxiliary_loss_mlp": 0.01025573, + "balance_loss_clip": 1.04281259, + "balance_loss_mlp": 1.01824141, + "epoch": 0.9115613539349486, + "flos": 18399729928320.0, + "grad_norm": 2.692894319953287, + "language_loss": 0.84322506, + "learning_rate": 8.139355716352137e-08, + "loss": 0.86476827, + "num_input_tokens_seen": 163935000, + "step": 7581, + "time_per_iteration": 2.497633695602417 + }, + { + "auxiliary_loss_clip": 0.01139455, + "auxiliary_loss_mlp": 0.01026491, + "balance_loss_clip": 1.04306817, + "balance_loss_mlp": 1.01909161, + "epoch": 0.9116815968255877, + "flos": 21726171619200.0, + "grad_norm": 2.2415526055878705, + "language_loss": 0.70173478, + "learning_rate": 8.117373824639196e-08, + "loss": 0.72339422, + "num_input_tokens_seen": 163955265, + "step": 7582, + "time_per_iteration": 2.486483573913574 + }, + { + "auxiliary_loss_clip": 0.01060359, + "auxiliary_loss_mlp": 0.01000657, + "balance_loss_clip": 1.00708246, + "balance_loss_mlp": 0.99976271, + "epoch": 0.9118018397162267, + "flos": 65363526835200.0, + "grad_norm": 0.7236309186387634, + "language_loss": 0.59293932, + "learning_rate": 8.095421041368067e-08, + "loss": 0.61354953, + "num_input_tokens_seen": 164014680, + "step": 7583, + "time_per_iteration": 2.9243624210357666 + }, + { + "auxiliary_loss_clip": 0.01134847, + "auxiliary_loss_mlp": 0.00761967, + "balance_loss_clip": 1.04453897, + "balance_loss_mlp": 1.00047135, + "epoch": 0.9119220826068659, + "flos": 20922885815040.0, + "grad_norm": 2.65117118910255, + "language_loss": 0.70748264, + "learning_rate": 8.073497369868999e-08, + "loss": 0.72645074, + "num_input_tokens_seen": 164033140, + "step": 7584, + "time_per_iteration": 2.4762933254241943 + }, + { + "auxiliary_loss_clip": 0.0114606, + "auxiliary_loss_mlp": 0.01023689, + "balance_loss_clip": 1.04572368, + "balance_loss_mlp": 1.0164355, + "epoch": 0.912042325497505, + "flos": 28366449327360.0, + "grad_norm": 2.0049272295103755, + "language_loss": 0.75278604, + "learning_rate": 8.051602813467772e-08, + "loss": 0.77448344, + "num_input_tokens_seen": 164054995, + "step": 7585, + "time_per_iteration": 2.5583436489105225 + }, + { + "auxiliary_loss_clip": 0.01155188, + "auxiliary_loss_mlp": 0.01022027, + "balance_loss_clip": 1.04712009, + "balance_loss_mlp": 1.01530313, + "epoch": 0.912162568388144, + "flos": 17566782468480.0, + "grad_norm": 5.615056663456039, + "language_loss": 0.71047747, + "learning_rate": 8.029737375485756e-08, + "loss": 0.73224968, + "num_input_tokens_seen": 164074225, + "step": 7586, + "time_per_iteration": 2.4545328617095947 + }, + { + "auxiliary_loss_clip": 0.01165329, + "auxiliary_loss_mlp": 0.01022721, + "balance_loss_clip": 1.047382, + "balance_loss_mlp": 1.01607215, + "epoch": 0.9122828112787832, + "flos": 19827897661440.0, + "grad_norm": 1.737784491086537, + "language_loss": 0.72438973, + "learning_rate": 8.007901059239986e-08, + "loss": 0.7462703, + "num_input_tokens_seen": 164093505, + "step": 7587, + "time_per_iteration": 3.2481539249420166 + }, + { + "auxiliary_loss_clip": 0.01136636, + "auxiliary_loss_mlp": 0.01020159, + "balance_loss_clip": 1.04129672, + "balance_loss_mlp": 1.01310825, + "epoch": 0.9124030541694222, + "flos": 20813789232000.0, + "grad_norm": 1.70199400515549, + "language_loss": 0.80106831, + "learning_rate": 7.986093868042964e-08, + "loss": 0.82263625, + "num_input_tokens_seen": 164113750, + "step": 7588, + "time_per_iteration": 2.4724841117858887 + }, + { + "auxiliary_loss_clip": 0.0114988, + "auxiliary_loss_mlp": 0.01028608, + "balance_loss_clip": 1.04561639, + "balance_loss_mlp": 1.02177155, + "epoch": 0.9125232970600613, + "flos": 25192305302400.0, + "grad_norm": 2.0762520832926286, + "language_loss": 0.67812824, + "learning_rate": 7.964315805202826e-08, + "loss": 0.69991308, + "num_input_tokens_seen": 164134330, + "step": 7589, + "time_per_iteration": 3.219475269317627 + }, + { + "auxiliary_loss_clip": 0.01136878, + "auxiliary_loss_mlp": 0.01022026, + "balance_loss_clip": 1.04485464, + "balance_loss_mlp": 1.01427102, + "epoch": 0.9126435399507005, + "flos": 19719591177600.0, + "grad_norm": 1.867345436542811, + "language_loss": 0.72886568, + "learning_rate": 7.942566874023304e-08, + "loss": 0.75045466, + "num_input_tokens_seen": 164153515, + "step": 7590, + "time_per_iteration": 3.209393262863159 + }, + { + "auxiliary_loss_clip": 0.01132917, + "auxiliary_loss_mlp": 0.01025876, + "balance_loss_clip": 1.0413537, + "balance_loss_mlp": 1.01860976, + "epoch": 0.9127637828413395, + "flos": 19573614305280.0, + "grad_norm": 2.0016724650280557, + "language_loss": 0.69446194, + "learning_rate": 7.920847077803649e-08, + "loss": 0.71604991, + "num_input_tokens_seen": 164171305, + "step": 7591, + "time_per_iteration": 2.452043294906616 + }, + { + "auxiliary_loss_clip": 0.01097696, + "auxiliary_loss_mlp": 0.01024446, + "balance_loss_clip": 1.03641629, + "balance_loss_mlp": 1.01745701, + "epoch": 0.9128840257319786, + "flos": 20230635928320.0, + "grad_norm": 2.003938756046427, + "language_loss": 0.82037741, + "learning_rate": 7.899156419838826e-08, + "loss": 0.84159881, + "num_input_tokens_seen": 164190275, + "step": 7592, + "time_per_iteration": 2.545992612838745 + }, + { + "auxiliary_loss_clip": 0.01119745, + "auxiliary_loss_mlp": 0.01023781, + "balance_loss_clip": 1.04175746, + "balance_loss_mlp": 1.01712632, + "epoch": 0.9130042686226177, + "flos": 24858658846080.0, + "grad_norm": 1.837050280725676, + "language_loss": 0.65535933, + "learning_rate": 7.87749490341918e-08, + "loss": 0.67679459, + "num_input_tokens_seen": 164210550, + "step": 7593, + "time_per_iteration": 2.547178268432617 + }, + { + "auxiliary_loss_clip": 0.01168361, + "auxiliary_loss_mlp": 0.01024578, + "balance_loss_clip": 1.0488019, + "balance_loss_mlp": 1.01726437, + "epoch": 0.9131245115132568, + "flos": 23581747284480.0, + "grad_norm": 2.1144910283486245, + "language_loss": 0.8319335, + "learning_rate": 7.855862531830836e-08, + "loss": 0.85386288, + "num_input_tokens_seen": 164226660, + "step": 7594, + "time_per_iteration": 2.405717134475708 + }, + { + "auxiliary_loss_clip": 0.01148084, + "auxiliary_loss_mlp": 0.01021432, + "balance_loss_clip": 1.04354155, + "balance_loss_mlp": 1.01450872, + "epoch": 0.9132447544038959, + "flos": 19931607204480.0, + "grad_norm": 1.5790664591105037, + "language_loss": 0.72917855, + "learning_rate": 7.834259308355373e-08, + "loss": 0.75087368, + "num_input_tokens_seen": 164245425, + "step": 7595, + "time_per_iteration": 2.4460058212280273 + }, + { + "auxiliary_loss_clip": 0.01078176, + "auxiliary_loss_mlp": 0.01024884, + "balance_loss_clip": 1.03630245, + "balance_loss_mlp": 1.01792192, + "epoch": 0.9133649972945349, + "flos": 21981747864960.0, + "grad_norm": 2.0350844150389253, + "language_loss": 0.75049615, + "learning_rate": 7.812685236269989e-08, + "loss": 0.77152681, + "num_input_tokens_seen": 164264085, + "step": 7596, + "time_per_iteration": 2.6115055084228516 + }, + { + "auxiliary_loss_clip": 0.01026284, + "auxiliary_loss_mlp": 0.01002656, + "balance_loss_clip": 1.00898743, + "balance_loss_mlp": 1.00182152, + "epoch": 0.9134852401851741, + "flos": 71240523511680.0, + "grad_norm": 0.7881695381492786, + "language_loss": 0.58615732, + "learning_rate": 7.791140318847445e-08, + "loss": 0.60644674, + "num_input_tokens_seen": 164322220, + "step": 7597, + "time_per_iteration": 3.8404805660247803 + }, + { + "auxiliary_loss_clip": 0.01134291, + "auxiliary_loss_mlp": 0.01019556, + "balance_loss_clip": 1.04687822, + "balance_loss_mlp": 1.01305056, + "epoch": 0.9136054830758131, + "flos": 23626923615360.0, + "grad_norm": 1.4800515167768031, + "language_loss": 0.80263603, + "learning_rate": 7.769624559356081e-08, + "loss": 0.82417446, + "num_input_tokens_seen": 164345615, + "step": 7598, + "time_per_iteration": 2.5635030269622803 + }, + { + "auxiliary_loss_clip": 0.01149581, + "auxiliary_loss_mlp": 0.01024296, + "balance_loss_clip": 1.04385662, + "balance_loss_mlp": 1.01632106, + "epoch": 0.9137257259664522, + "flos": 23438858981760.0, + "grad_norm": 3.999870129167285, + "language_loss": 0.75410414, + "learning_rate": 7.748137961059842e-08, + "loss": 0.77584291, + "num_input_tokens_seen": 164359595, + "step": 7599, + "time_per_iteration": 2.4458670616149902 + }, + { + "auxiliary_loss_clip": 0.0116198, + "auxiliary_loss_mlp": 0.01024343, + "balance_loss_clip": 1.04683089, + "balance_loss_mlp": 1.01709819, + "epoch": 0.9138459688570914, + "flos": 19127854523520.0, + "grad_norm": 2.5979285564238825, + "language_loss": 0.65739489, + "learning_rate": 7.726680527218211e-08, + "loss": 0.67925811, + "num_input_tokens_seen": 164376635, + "step": 7600, + "time_per_iteration": 2.3873424530029297 + }, + { + "auxiliary_loss_clip": 0.01164609, + "auxiliary_loss_mlp": 0.01022347, + "balance_loss_clip": 1.04517794, + "balance_loss_mlp": 1.01536131, + "epoch": 0.9139662117477304, + "flos": 46281240714240.0, + "grad_norm": 1.6979897915311042, + "language_loss": 0.75865489, + "learning_rate": 7.70525226108627e-08, + "loss": 0.78052443, + "num_input_tokens_seen": 164400305, + "step": 7601, + "time_per_iteration": 2.619725465774536 + }, + { + "auxiliary_loss_clip": 0.01152321, + "auxiliary_loss_mlp": 0.01026129, + "balance_loss_clip": 1.04807508, + "balance_loss_mlp": 1.01919091, + "epoch": 0.9140864546383695, + "flos": 22273198819200.0, + "grad_norm": 2.168382931186185, + "language_loss": 0.79799485, + "learning_rate": 7.683853165914666e-08, + "loss": 0.81977934, + "num_input_tokens_seen": 164418075, + "step": 7602, + "time_per_iteration": 2.4384236335754395 + }, + { + "auxiliary_loss_clip": 0.01110692, + "auxiliary_loss_mlp": 0.01025256, + "balance_loss_clip": 1.04312313, + "balance_loss_mlp": 1.01855612, + "epoch": 0.9142066975290086, + "flos": 17530009920000.0, + "grad_norm": 1.9916114291657985, + "language_loss": 0.77349365, + "learning_rate": 7.662483244949602e-08, + "loss": 0.79485315, + "num_input_tokens_seen": 164435335, + "step": 7603, + "time_per_iteration": 2.5221846103668213 + }, + { + "auxiliary_loss_clip": 0.01115678, + "auxiliary_loss_mlp": 0.01019379, + "balance_loss_clip": 1.04257107, + "balance_loss_mlp": 1.01239026, + "epoch": 0.9143269404196477, + "flos": 17712148809600.0, + "grad_norm": 2.980415360896527, + "language_loss": 0.80810434, + "learning_rate": 7.641142501432951e-08, + "loss": 0.8294549, + "num_input_tokens_seen": 164451530, + "step": 7604, + "time_per_iteration": 2.4473040103912354 + }, + { + "auxiliary_loss_clip": 0.01131348, + "auxiliary_loss_mlp": 0.01023347, + "balance_loss_clip": 1.04267859, + "balance_loss_mlp": 1.01647472, + "epoch": 0.9144471833102867, + "flos": 33323414019840.0, + "grad_norm": 1.8313251872224725, + "language_loss": 0.73798144, + "learning_rate": 7.619830938602013e-08, + "loss": 0.7595284, + "num_input_tokens_seen": 164472755, + "step": 7605, + "time_per_iteration": 2.581111431121826 + }, + { + "auxiliary_loss_clip": 0.01145574, + "auxiliary_loss_mlp": 0.01022238, + "balance_loss_clip": 1.04453278, + "balance_loss_mlp": 1.01508856, + "epoch": 0.9145674262009259, + "flos": 21068970428160.0, + "grad_norm": 2.413515252292556, + "language_loss": 0.82065606, + "learning_rate": 7.598548559689777e-08, + "loss": 0.84233415, + "num_input_tokens_seen": 164491155, + "step": 7606, + "time_per_iteration": 2.441040277481079 + }, + { + "auxiliary_loss_clip": 0.01117324, + "auxiliary_loss_mlp": 0.01022258, + "balance_loss_clip": 1.04175425, + "balance_loss_mlp": 1.01544833, + "epoch": 0.914687669091565, + "flos": 16800269212800.0, + "grad_norm": 6.175178509466248, + "language_loss": 0.80802983, + "learning_rate": 7.577295367924751e-08, + "loss": 0.82942563, + "num_input_tokens_seen": 164507555, + "step": 7607, + "time_per_iteration": 2.474886655807495 + }, + { + "auxiliary_loss_clip": 0.01141552, + "auxiliary_loss_mlp": 0.0102126, + "balance_loss_clip": 1.04582965, + "balance_loss_mlp": 1.0139823, + "epoch": 0.914807911982204, + "flos": 25773627012480.0, + "grad_norm": 1.7403130139530418, + "language_loss": 0.82291836, + "learning_rate": 7.556071366531002e-08, + "loss": 0.8445465, + "num_input_tokens_seen": 164528525, + "step": 7608, + "time_per_iteration": 2.517787218093872 + }, + { + "auxiliary_loss_clip": 0.01150925, + "auxiliary_loss_mlp": 0.01024437, + "balance_loss_clip": 1.04643726, + "balance_loss_mlp": 1.01704049, + "epoch": 0.9149281548728432, + "flos": 19208043636480.0, + "grad_norm": 4.017934441258296, + "language_loss": 0.79343176, + "learning_rate": 7.53487655872822e-08, + "loss": 0.81518531, + "num_input_tokens_seen": 164547695, + "step": 7609, + "time_per_iteration": 2.4374399185180664 + }, + { + "auxiliary_loss_clip": 0.01110623, + "auxiliary_loss_mlp": 0.01022692, + "balance_loss_clip": 1.03855669, + "balance_loss_mlp": 1.01514578, + "epoch": 0.9150483977634822, + "flos": 26870554500480.0, + "grad_norm": 1.8176322488121996, + "language_loss": 0.73922813, + "learning_rate": 7.513710947731656e-08, + "loss": 0.76056129, + "num_input_tokens_seen": 164568905, + "step": 7610, + "time_per_iteration": 2.56787109375 + }, + { + "auxiliary_loss_clip": 0.01129543, + "auxiliary_loss_mlp": 0.01024964, + "balance_loss_clip": 1.04334033, + "balance_loss_mlp": 1.01781785, + "epoch": 0.9151686406541213, + "flos": 21908956953600.0, + "grad_norm": 1.950405755254037, + "language_loss": 0.85082835, + "learning_rate": 7.492574536752095e-08, + "loss": 0.87237346, + "num_input_tokens_seen": 164588895, + "step": 7611, + "time_per_iteration": 2.4774608612060547 + }, + { + "auxiliary_loss_clip": 0.01136622, + "auxiliary_loss_mlp": 0.0102584, + "balance_loss_clip": 1.04433525, + "balance_loss_mlp": 1.01925111, + "epoch": 0.9152888835447605, + "flos": 27308556944640.0, + "grad_norm": 1.9405805394408722, + "language_loss": 0.78302896, + "learning_rate": 7.471467328995907e-08, + "loss": 0.80465358, + "num_input_tokens_seen": 164607705, + "step": 7612, + "time_per_iteration": 2.5159993171691895 + }, + { + "auxiliary_loss_clip": 0.0108466, + "auxiliary_loss_mlp": 0.0102145, + "balance_loss_clip": 1.0383029, + "balance_loss_mlp": 1.01409447, + "epoch": 0.9154091264353995, + "flos": 13370728510080.0, + "grad_norm": 2.288525583068321, + "language_loss": 0.6045481, + "learning_rate": 7.450389327665018e-08, + "loss": 0.62560916, + "num_input_tokens_seen": 164625540, + "step": 7613, + "time_per_iteration": 2.595487117767334 + }, + { + "auxiliary_loss_clip": 0.01127667, + "auxiliary_loss_mlp": 0.01027012, + "balance_loss_clip": 1.0492301, + "balance_loss_mlp": 1.01963925, + "epoch": 0.9155293693260386, + "flos": 20193037367040.0, + "grad_norm": 3.6124289286554423, + "language_loss": 0.67764872, + "learning_rate": 7.429340535957029e-08, + "loss": 0.6991955, + "num_input_tokens_seen": 164640735, + "step": 7614, + "time_per_iteration": 3.337618589401245 + }, + { + "auxiliary_loss_clip": 0.01136766, + "auxiliary_loss_mlp": 0.01025526, + "balance_loss_clip": 1.04304767, + "balance_loss_mlp": 1.01893091, + "epoch": 0.9156496122166777, + "flos": 19354990176000.0, + "grad_norm": 2.188030475064479, + "language_loss": 0.70678705, + "learning_rate": 7.40832095706494e-08, + "loss": 0.72841001, + "num_input_tokens_seen": 164657430, + "step": 7615, + "time_per_iteration": 3.2225139141082764 + }, + { + "auxiliary_loss_clip": 0.01129649, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.04544044, + "balance_loss_mlp": 1.02209473, + "epoch": 0.9157698551073168, + "flos": 21107287261440.0, + "grad_norm": 1.8147881568414153, + "language_loss": 0.79950982, + "learning_rate": 7.387330594177443e-08, + "loss": 0.82109416, + "num_input_tokens_seen": 164679505, + "step": 7616, + "time_per_iteration": 3.3735921382904053 + }, + { + "auxiliary_loss_clip": 0.01118296, + "auxiliary_loss_mlp": 0.01027594, + "balance_loss_clip": 1.04283071, + "balance_loss_mlp": 1.02089691, + "epoch": 0.9158900979979558, + "flos": 25193167228800.0, + "grad_norm": 1.6284707456835912, + "language_loss": 0.79017949, + "learning_rate": 7.366369450478749e-08, + "loss": 0.81163836, + "num_input_tokens_seen": 164700615, + "step": 7617, + "time_per_iteration": 2.54510235786438 + }, + { + "auxiliary_loss_clip": 0.01118042, + "auxiliary_loss_mlp": 0.01026774, + "balance_loss_clip": 1.04187059, + "balance_loss_mlp": 1.01980066, + "epoch": 0.916010340888595, + "flos": 30146648302080.0, + "grad_norm": 1.5856731105762434, + "language_loss": 0.66581762, + "learning_rate": 7.345437529148646e-08, + "loss": 0.68726575, + "num_input_tokens_seen": 164719625, + "step": 7618, + "time_per_iteration": 2.5736329555511475 + }, + { + "auxiliary_loss_clip": 0.01121513, + "auxiliary_loss_mlp": 0.01030444, + "balance_loss_clip": 1.04237258, + "balance_loss_mlp": 1.02339554, + "epoch": 0.9161305837792341, + "flos": 17091827907840.0, + "grad_norm": 1.8377405237655395, + "language_loss": 0.72814631, + "learning_rate": 7.324534833362483e-08, + "loss": 0.74966586, + "num_input_tokens_seen": 164737200, + "step": 7619, + "time_per_iteration": 2.4841952323913574 + }, + { + "auxiliary_loss_clip": 0.01137303, + "auxiliary_loss_mlp": 0.01023393, + "balance_loss_clip": 1.04526663, + "balance_loss_mlp": 1.01624596, + "epoch": 0.9162508266698731, + "flos": 22893699288960.0, + "grad_norm": 1.78582716112665, + "language_loss": 0.68454885, + "learning_rate": 7.303661366291192e-08, + "loss": 0.70615578, + "num_input_tokens_seen": 164757870, + "step": 7620, + "time_per_iteration": 2.5334620475769043 + }, + { + "auxiliary_loss_clip": 0.01107017, + "auxiliary_loss_mlp": 0.01022593, + "balance_loss_clip": 1.04087758, + "balance_loss_mlp": 1.0158217, + "epoch": 0.9163710695605123, + "flos": 19974808287360.0, + "grad_norm": 1.7482009389320294, + "language_loss": 0.81431764, + "learning_rate": 7.28281713110126e-08, + "loss": 0.83561373, + "num_input_tokens_seen": 164775945, + "step": 7621, + "time_per_iteration": 2.546293258666992 + }, + { + "auxiliary_loss_clip": 0.01134049, + "auxiliary_loss_mlp": 0.01024958, + "balance_loss_clip": 1.0453943, + "balance_loss_mlp": 1.0179249, + "epoch": 0.9164913124511513, + "flos": 22783812606720.0, + "grad_norm": 1.9266749360005215, + "language_loss": 0.77157038, + "learning_rate": 7.262002130954759e-08, + "loss": 0.79316044, + "num_input_tokens_seen": 164794400, + "step": 7622, + "time_per_iteration": 2.507571220397949 + }, + { + "auxiliary_loss_clip": 0.01113115, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.04219127, + "balance_loss_mlp": 1.01950264, + "epoch": 0.9166115553417904, + "flos": 24900854348160.0, + "grad_norm": 1.6920187560768574, + "language_loss": 0.78658438, + "learning_rate": 7.241216369009296e-08, + "loss": 0.80798596, + "num_input_tokens_seen": 164814585, + "step": 7623, + "time_per_iteration": 3.3467843532562256 + }, + { + "auxiliary_loss_clip": 0.01162854, + "auxiliary_loss_mlp": 0.010208, + "balance_loss_clip": 1.0452739, + "balance_loss_mlp": 1.01393938, + "epoch": 0.9167317982324296, + "flos": 25702919089920.0, + "grad_norm": 2.8370637099199048, + "language_loss": 0.66368234, + "learning_rate": 7.220459848418037e-08, + "loss": 0.68551886, + "num_input_tokens_seen": 164834660, + "step": 7624, + "time_per_iteration": 2.4610095024108887 + }, + { + "auxiliary_loss_clip": 0.0116212, + "auxiliary_loss_mlp": 0.01021668, + "balance_loss_clip": 1.04705691, + "balance_loss_mlp": 1.01495612, + "epoch": 0.9168520411230686, + "flos": 15632813370240.0, + "grad_norm": 1.6681159869818696, + "language_loss": 0.79513001, + "learning_rate": 7.199732572329708e-08, + "loss": 0.8169679, + "num_input_tokens_seen": 164852560, + "step": 7625, + "time_per_iteration": 2.41252064704895 + }, + { + "auxiliary_loss_clip": 0.01126649, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.04299486, + "balance_loss_mlp": 1.02026534, + "epoch": 0.9169722840137077, + "flos": 30258151096320.0, + "grad_norm": 2.2095812584423653, + "language_loss": 0.75729418, + "learning_rate": 7.179034543888684e-08, + "loss": 0.77883625, + "num_input_tokens_seen": 164872065, + "step": 7626, + "time_per_iteration": 2.5752599239349365 + }, + { + "auxiliary_loss_clip": 0.011522, + "auxiliary_loss_mlp": 0.01025421, + "balance_loss_clip": 1.04423654, + "balance_loss_mlp": 1.01856041, + "epoch": 0.9170925269043467, + "flos": 22491643380480.0, + "grad_norm": 2.582222675900588, + "language_loss": 0.77464998, + "learning_rate": 7.158365766234808e-08, + "loss": 0.79642618, + "num_input_tokens_seen": 164890915, + "step": 7627, + "time_per_iteration": 2.4673123359680176 + }, + { + "auxiliary_loss_clip": 0.01113003, + "auxiliary_loss_mlp": 0.0102329, + "balance_loss_clip": 1.03824651, + "balance_loss_mlp": 1.0156486, + "epoch": 0.9172127697949859, + "flos": 22893914770560.0, + "grad_norm": 1.9622509135071753, + "language_loss": 0.72362083, + "learning_rate": 7.137726242503527e-08, + "loss": 0.74498373, + "num_input_tokens_seen": 164909835, + "step": 7628, + "time_per_iteration": 2.513026714324951 + }, + { + "auxiliary_loss_clip": 0.01149513, + "auxiliary_loss_mlp": 0.00762049, + "balance_loss_clip": 1.04580176, + "balance_loss_mlp": 1.00047135, + "epoch": 0.917333012685625, + "flos": 17451867882240.0, + "grad_norm": 2.3189796566703813, + "language_loss": 0.77865136, + "learning_rate": 7.11711597582585e-08, + "loss": 0.79776692, + "num_input_tokens_seen": 164927195, + "step": 7629, + "time_per_iteration": 2.4811604022979736 + }, + { + "auxiliary_loss_clip": 0.01121428, + "auxiliary_loss_mlp": 0.01020991, + "balance_loss_clip": 1.03957272, + "balance_loss_mlp": 1.01472926, + "epoch": 0.917453255576264, + "flos": 14318949692160.0, + "grad_norm": 1.819354923664696, + "language_loss": 0.79972935, + "learning_rate": 7.096534969328271e-08, + "loss": 0.82115358, + "num_input_tokens_seen": 164944640, + "step": 7630, + "time_per_iteration": 2.476349115371704 + }, + { + "auxiliary_loss_clip": 0.01140394, + "auxiliary_loss_mlp": 0.01021659, + "balance_loss_clip": 1.04276109, + "balance_loss_mlp": 1.01481354, + "epoch": 0.9175734984669032, + "flos": 20741177888640.0, + "grad_norm": 2.535046132391288, + "language_loss": 0.83909428, + "learning_rate": 7.075983226132987e-08, + "loss": 0.86071479, + "num_input_tokens_seen": 164963570, + "step": 7631, + "time_per_iteration": 2.4864015579223633 + }, + { + "auxiliary_loss_clip": 0.01139387, + "auxiliary_loss_mlp": 0.0076235, + "balance_loss_clip": 1.04189718, + "balance_loss_mlp": 1.00040793, + "epoch": 0.9176937413575422, + "flos": 14830497233280.0, + "grad_norm": 2.752170995558212, + "language_loss": 0.7917909, + "learning_rate": 7.055460749357656e-08, + "loss": 0.8108083, + "num_input_tokens_seen": 164979850, + "step": 7632, + "time_per_iteration": 2.43985915184021 + }, + { + "auxiliary_loss_clip": 0.01137293, + "auxiliary_loss_mlp": 0.01026468, + "balance_loss_clip": 1.04539037, + "balance_loss_mlp": 1.0190413, + "epoch": 0.9178139842481813, + "flos": 18474603828480.0, + "grad_norm": 1.6268610325364654, + "language_loss": 0.70351678, + "learning_rate": 7.034967542115521e-08, + "loss": 0.7251544, + "num_input_tokens_seen": 164998115, + "step": 7633, + "time_per_iteration": 2.4819447994232178 + }, + { + "auxiliary_loss_clip": 0.01140377, + "auxiliary_loss_mlp": 0.00761638, + "balance_loss_clip": 1.04300833, + "balance_loss_mlp": 1.00050187, + "epoch": 0.9179342271388204, + "flos": 20047455544320.0, + "grad_norm": 2.1005244022234226, + "language_loss": 0.75454772, + "learning_rate": 7.014503607515388e-08, + "loss": 0.77356791, + "num_input_tokens_seen": 165017420, + "step": 7634, + "time_per_iteration": 2.4153895378112793 + }, + { + "auxiliary_loss_clip": 0.01137467, + "auxiliary_loss_mlp": 0.01028589, + "balance_loss_clip": 1.04749775, + "balance_loss_mlp": 1.02157021, + "epoch": 0.9180544700294595, + "flos": 24676232647680.0, + "grad_norm": 2.333512025323974, + "language_loss": 0.68532926, + "learning_rate": 6.994068948661592e-08, + "loss": 0.70698977, + "num_input_tokens_seen": 165035575, + "step": 7635, + "time_per_iteration": 2.519930362701416 + }, + { + "auxiliary_loss_clip": 0.01150302, + "auxiliary_loss_mlp": 0.01024838, + "balance_loss_clip": 1.04553819, + "balance_loss_mlp": 1.01688635, + "epoch": 0.9181747129200986, + "flos": 16727478301440.0, + "grad_norm": 2.31083351932169, + "language_loss": 0.7637673, + "learning_rate": 6.973663568654142e-08, + "loss": 0.78551865, + "num_input_tokens_seen": 165053280, + "step": 7636, + "time_per_iteration": 2.4259276390075684 + }, + { + "auxiliary_loss_clip": 0.01163957, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.0475173, + "balance_loss_mlp": 1.02247739, + "epoch": 0.9182949558107377, + "flos": 24271626873600.0, + "grad_norm": 2.4403909420445866, + "language_loss": 0.65316081, + "learning_rate": 6.953287470588386e-08, + "loss": 0.67509669, + "num_input_tokens_seen": 165071235, + "step": 7637, + "time_per_iteration": 2.438974380493164 + }, + { + "auxiliary_loss_clip": 0.01154247, + "auxiliary_loss_mlp": 0.01024416, + "balance_loss_clip": 1.04480255, + "balance_loss_mlp": 1.01719177, + "epoch": 0.9184151987013768, + "flos": 22082117443200.0, + "grad_norm": 2.2356865436449294, + "language_loss": 0.86101222, + "learning_rate": 6.932940657555452e-08, + "loss": 0.88279891, + "num_input_tokens_seen": 165087365, + "step": 7638, + "time_per_iteration": 2.4101572036743164 + }, + { + "auxiliary_loss_clip": 0.01158337, + "auxiliary_loss_mlp": 0.01021286, + "balance_loss_clip": 1.04486382, + "balance_loss_mlp": 1.0150187, + "epoch": 0.9185354415920158, + "flos": 32166732257280.0, + "grad_norm": 1.3898957626803734, + "language_loss": 0.76542276, + "learning_rate": 6.912623132641938e-08, + "loss": 0.78721899, + "num_input_tokens_seen": 165112455, + "step": 7639, + "time_per_iteration": 2.5528690814971924 + }, + { + "auxiliary_loss_clip": 0.01138677, + "auxiliary_loss_mlp": 0.01027123, + "balance_loss_clip": 1.04500747, + "balance_loss_mlp": 1.01977324, + "epoch": 0.918655684482655, + "flos": 20997831542400.0, + "grad_norm": 1.7388803268277686, + "language_loss": 0.76233828, + "learning_rate": 6.892334898929952e-08, + "loss": 0.78399622, + "num_input_tokens_seen": 165132700, + "step": 7640, + "time_per_iteration": 3.331432819366455 + }, + { + "auxiliary_loss_clip": 0.01144526, + "auxiliary_loss_mlp": 0.01024127, + "balance_loss_clip": 1.04361367, + "balance_loss_mlp": 1.01712942, + "epoch": 0.918775927373294, + "flos": 15560704817280.0, + "grad_norm": 4.224198701075811, + "language_loss": 0.84922671, + "learning_rate": 6.872075959497236e-08, + "loss": 0.87091327, + "num_input_tokens_seen": 165151475, + "step": 7641, + "time_per_iteration": 2.4294159412384033 + }, + { + "auxiliary_loss_clip": 0.01151841, + "auxiliary_loss_mlp": 0.0102423, + "balance_loss_clip": 1.04352629, + "balance_loss_mlp": 1.01765573, + "epoch": 0.9188961702639331, + "flos": 29934057657600.0, + "grad_norm": 1.834512383712307, + "language_loss": 0.82921076, + "learning_rate": 6.85184631741702e-08, + "loss": 0.85097146, + "num_input_tokens_seen": 165172040, + "step": 7642, + "time_per_iteration": 3.274141550064087 + }, + { + "auxiliary_loss_clip": 0.01149807, + "auxiliary_loss_mlp": 0.01022051, + "balance_loss_clip": 1.0444268, + "balance_loss_mlp": 1.01463604, + "epoch": 0.9190164131545723, + "flos": 20701244943360.0, + "grad_norm": 2.053465059566003, + "language_loss": 0.77197266, + "learning_rate": 6.831645975758161e-08, + "loss": 0.79369128, + "num_input_tokens_seen": 165189980, + "step": 7643, + "time_per_iteration": 3.158571243286133 + }, + { + "auxiliary_loss_clip": 0.01131273, + "auxiliary_loss_mlp": 0.01025614, + "balance_loss_clip": 1.04436064, + "balance_loss_mlp": 1.0187242, + "epoch": 0.9191366560452113, + "flos": 25629912696960.0, + "grad_norm": 1.8956614170926007, + "language_loss": 0.67449749, + "learning_rate": 6.811474937585026e-08, + "loss": 0.69606632, + "num_input_tokens_seen": 165209770, + "step": 7644, + "time_per_iteration": 2.4951059818267822 + }, + { + "auxiliary_loss_clip": 0.0111854, + "auxiliary_loss_mlp": 0.01022489, + "balance_loss_clip": 1.04157495, + "balance_loss_mlp": 1.01566386, + "epoch": 0.9192568989358504, + "flos": 21434325615360.0, + "grad_norm": 1.7229902207045182, + "language_loss": 0.79311526, + "learning_rate": 6.79133320595755e-08, + "loss": 0.81452554, + "num_input_tokens_seen": 165229690, + "step": 7645, + "time_per_iteration": 2.5088160037994385 + }, + { + "auxiliary_loss_clip": 0.01122449, + "auxiliary_loss_mlp": 0.00761736, + "balance_loss_clip": 1.04321325, + "balance_loss_mlp": 1.00047684, + "epoch": 0.9193771418264896, + "flos": 23185078416000.0, + "grad_norm": 1.6575532127398032, + "language_loss": 0.75533366, + "learning_rate": 6.771220783931198e-08, + "loss": 0.77417552, + "num_input_tokens_seen": 165249850, + "step": 7646, + "time_per_iteration": 2.5735199451446533 + }, + { + "auxiliary_loss_clip": 0.01003497, + "auxiliary_loss_mlp": 0.0100263, + "balance_loss_clip": 1.02024174, + "balance_loss_mlp": 1.00112808, + "epoch": 0.9194973847171286, + "flos": 70582963184640.0, + "grad_norm": 0.9037482616793509, + "language_loss": 0.64680159, + "learning_rate": 6.751137674556994e-08, + "loss": 0.66686285, + "num_input_tokens_seen": 165310235, + "step": 7647, + "time_per_iteration": 3.3206405639648438 + }, + { + "auxiliary_loss_clip": 0.01151934, + "auxiliary_loss_mlp": 0.01020698, + "balance_loss_clip": 1.04307282, + "balance_loss_mlp": 1.01377749, + "epoch": 0.9196176276077677, + "flos": 14720682378240.0, + "grad_norm": 2.176927481258253, + "language_loss": 0.77329636, + "learning_rate": 6.731083880881572e-08, + "loss": 0.79502273, + "num_input_tokens_seen": 165326455, + "step": 7648, + "time_per_iteration": 2.616607666015625 + }, + { + "auxiliary_loss_clip": 0.01136327, + "auxiliary_loss_mlp": 0.01021587, + "balance_loss_clip": 1.04371262, + "balance_loss_mlp": 1.01500916, + "epoch": 0.9197378704984068, + "flos": 23294893271040.0, + "grad_norm": 2.090158422838104, + "language_loss": 0.8082872, + "learning_rate": 6.711059405947072e-08, + "loss": 0.82986629, + "num_input_tokens_seen": 165344645, + "step": 7649, + "time_per_iteration": 2.494624137878418 + }, + { + "auxiliary_loss_clip": 0.01119597, + "auxiliary_loss_mlp": 0.01022311, + "balance_loss_clip": 1.0427438, + "balance_loss_mlp": 1.01512527, + "epoch": 0.9198581133890459, + "flos": 20302564913280.0, + "grad_norm": 2.082361382738563, + "language_loss": 0.77008837, + "learning_rate": 6.691064252791156e-08, + "loss": 0.79150748, + "num_input_tokens_seen": 165364120, + "step": 7650, + "time_per_iteration": 3.2656569480895996 + }, + { + "auxiliary_loss_clip": 0.01102726, + "auxiliary_loss_mlp": 0.01024218, + "balance_loss_clip": 1.0415051, + "balance_loss_mlp": 1.01702416, + "epoch": 0.9199783562796849, + "flos": 17675663569920.0, + "grad_norm": 1.6669751251756852, + "language_loss": 0.77895582, + "learning_rate": 6.67109842444713e-08, + "loss": 0.80022526, + "num_input_tokens_seen": 165383050, + "step": 7651, + "time_per_iteration": 2.512842893600464 + }, + { + "auxiliary_loss_clip": 0.01153345, + "auxiliary_loss_mlp": 0.00762696, + "balance_loss_clip": 1.05007184, + "balance_loss_mlp": 1.00045204, + "epoch": 0.9200985991703241, + "flos": 17676022705920.0, + "grad_norm": 1.991096165951223, + "language_loss": 0.76562107, + "learning_rate": 6.651161923943704e-08, + "loss": 0.7847814, + "num_input_tokens_seen": 165400955, + "step": 7652, + "time_per_iteration": 2.4329683780670166 + }, + { + "auxiliary_loss_clip": 0.01146242, + "auxiliary_loss_mlp": 0.01027349, + "balance_loss_clip": 1.04374027, + "balance_loss_mlp": 1.0198952, + "epoch": 0.9202188420609632, + "flos": 20996574566400.0, + "grad_norm": 2.1668702760713856, + "language_loss": 0.77489841, + "learning_rate": 6.631254754305326e-08, + "loss": 0.79663432, + "num_input_tokens_seen": 165420415, + "step": 7653, + "time_per_iteration": 2.442662477493286 + }, + { + "auxiliary_loss_clip": 0.01167091, + "auxiliary_loss_mlp": 0.0102357, + "balance_loss_clip": 1.04695463, + "balance_loss_mlp": 1.01641774, + "epoch": 0.9203390849516022, + "flos": 13918222586880.0, + "grad_norm": 2.497854279128939, + "language_loss": 0.77975565, + "learning_rate": 6.611376918551848e-08, + "loss": 0.80166227, + "num_input_tokens_seen": 165439200, + "step": 7654, + "time_per_iteration": 2.3864758014678955 + }, + { + "auxiliary_loss_clip": 0.01121223, + "auxiliary_loss_mlp": 0.00762089, + "balance_loss_clip": 1.041278, + "balance_loss_mlp": 1.00042582, + "epoch": 0.9204593278422414, + "flos": 21175912195200.0, + "grad_norm": 2.0443332873869093, + "language_loss": 0.79564846, + "learning_rate": 6.591528419698744e-08, + "loss": 0.81448162, + "num_input_tokens_seen": 165458985, + "step": 7655, + "time_per_iteration": 2.5202560424804688 + }, + { + "auxiliary_loss_clip": 0.01139449, + "auxiliary_loss_mlp": 0.01025383, + "balance_loss_clip": 1.04314375, + "balance_loss_mlp": 1.01906824, + "epoch": 0.9205795707328804, + "flos": 14501375890560.0, + "grad_norm": 2.36874502198062, + "language_loss": 0.83454204, + "learning_rate": 6.571709260756986e-08, + "loss": 0.85619038, + "num_input_tokens_seen": 165475630, + "step": 7656, + "time_per_iteration": 2.436110734939575 + }, + { + "auxiliary_loss_clip": 0.01155873, + "auxiliary_loss_mlp": 0.01028997, + "balance_loss_clip": 1.05001628, + "balance_loss_mlp": 1.02145672, + "epoch": 0.9206998136235195, + "flos": 22417559579520.0, + "grad_norm": 2.944361512186727, + "language_loss": 0.76342273, + "learning_rate": 6.551919444733122e-08, + "loss": 0.78527141, + "num_input_tokens_seen": 165493445, + "step": 7657, + "time_per_iteration": 2.427248954772949 + }, + { + "auxiliary_loss_clip": 0.01136374, + "auxiliary_loss_mlp": 0.01025271, + "balance_loss_clip": 1.04570186, + "balance_loss_mlp": 1.01783276, + "epoch": 0.9208200565141585, + "flos": 53358407544960.0, + "grad_norm": 3.6060789658169665, + "language_loss": 0.65963018, + "learning_rate": 6.53215897462931e-08, + "loss": 0.68124658, + "num_input_tokens_seen": 165517200, + "step": 7658, + "time_per_iteration": 2.7555553913116455 + }, + { + "auxiliary_loss_clip": 0.01148493, + "auxiliary_loss_mlp": 0.01027063, + "balance_loss_clip": 1.04461527, + "balance_loss_mlp": 1.01982403, + "epoch": 0.9209402994047977, + "flos": 30589139946240.0, + "grad_norm": 2.2499708108735508, + "language_loss": 0.74821383, + "learning_rate": 6.512427853443103e-08, + "loss": 0.76996934, + "num_input_tokens_seen": 165539280, + "step": 7659, + "time_per_iteration": 2.517777919769287 + }, + { + "auxiliary_loss_clip": 0.0115339, + "auxiliary_loss_mlp": 0.01019071, + "balance_loss_clip": 1.04559326, + "balance_loss_mlp": 1.01173377, + "epoch": 0.9210605422954368, + "flos": 29132711187840.0, + "grad_norm": 1.6276872320456355, + "language_loss": 0.75560385, + "learning_rate": 6.492726084167799e-08, + "loss": 0.77732849, + "num_input_tokens_seen": 165561395, + "step": 7660, + "time_per_iteration": 2.506317138671875 + }, + { + "auxiliary_loss_clip": 0.01060779, + "auxiliary_loss_mlp": 0.0100092, + "balance_loss_clip": 1.00725389, + "balance_loss_mlp": 1.00006723, + "epoch": 0.9211807851860758, + "flos": 54853838472960.0, + "grad_norm": 0.7772790524315556, + "language_loss": 0.57487422, + "learning_rate": 6.473053669792072e-08, + "loss": 0.59549117, + "num_input_tokens_seen": 165616085, + "step": 7661, + "time_per_iteration": 2.8749241828918457 + }, + { + "auxiliary_loss_clip": 0.01148598, + "auxiliary_loss_mlp": 0.0102397, + "balance_loss_clip": 1.04341173, + "balance_loss_mlp": 1.01619124, + "epoch": 0.921301028076715, + "flos": 19201974238080.0, + "grad_norm": 6.1458698094312405, + "language_loss": 0.72845161, + "learning_rate": 6.453410613300248e-08, + "loss": 0.75017726, + "num_input_tokens_seen": 165634015, + "step": 7662, + "time_per_iteration": 2.4140779972076416 + }, + { + "auxiliary_loss_clip": 0.01094105, + "auxiliary_loss_mlp": 0.01027295, + "balance_loss_clip": 1.04023457, + "balance_loss_mlp": 1.01992822, + "epoch": 0.921421270967354, + "flos": 27526893765120.0, + "grad_norm": 1.671496699512302, + "language_loss": 0.58453703, + "learning_rate": 6.43379691767214e-08, + "loss": 0.60575104, + "num_input_tokens_seen": 165653220, + "step": 7663, + "time_per_iteration": 2.6053354740142822 + }, + { + "auxiliary_loss_clip": 0.01022273, + "auxiliary_loss_mlp": 0.01001594, + "balance_loss_clip": 1.0084753, + "balance_loss_mlp": 1.0007118, + "epoch": 0.9215415138579931, + "flos": 70209311955840.0, + "grad_norm": 0.7166928675433318, + "language_loss": 0.55127835, + "learning_rate": 6.414212585883105e-08, + "loss": 0.57151705, + "num_input_tokens_seen": 165715850, + "step": 7664, + "time_per_iteration": 3.1619186401367188 + }, + { + "auxiliary_loss_clip": 0.01140502, + "auxiliary_loss_mlp": 0.01020532, + "balance_loss_clip": 1.04497957, + "balance_loss_mlp": 1.01301932, + "epoch": 0.9216617567486323, + "flos": 35553107790720.0, + "grad_norm": 1.6033462203750817, + "language_loss": 0.7004081, + "learning_rate": 6.394657620904143e-08, + "loss": 0.72201842, + "num_input_tokens_seen": 165738960, + "step": 7665, + "time_per_iteration": 2.588714838027954 + }, + { + "auxiliary_loss_clip": 0.01169377, + "auxiliary_loss_mlp": 0.01026027, + "balance_loss_clip": 1.04820788, + "balance_loss_mlp": 1.01867187, + "epoch": 0.9217819996392713, + "flos": 29533330552320.0, + "grad_norm": 1.7149657571500336, + "language_loss": 0.72017533, + "learning_rate": 6.375132025701657e-08, + "loss": 0.74212939, + "num_input_tokens_seen": 165761260, + "step": 7666, + "time_per_iteration": 2.499300956726074 + }, + { + "auxiliary_loss_clip": 0.0116913, + "auxiliary_loss_mlp": 0.01024412, + "balance_loss_clip": 1.04929817, + "balance_loss_mlp": 1.01709533, + "epoch": 0.9219022425299104, + "flos": 14574669592320.0, + "grad_norm": 2.151956093164181, + "language_loss": 0.69253838, + "learning_rate": 6.355635803237724e-08, + "loss": 0.71447384, + "num_input_tokens_seen": 165776960, + "step": 7667, + "time_per_iteration": 3.2311604022979736 + }, + { + "auxiliary_loss_clip": 0.01148691, + "auxiliary_loss_mlp": 0.01024621, + "balance_loss_clip": 1.04342127, + "balance_loss_mlp": 1.01731324, + "epoch": 0.9220224854205495, + "flos": 18077503996800.0, + "grad_norm": 1.9775197839352552, + "language_loss": 0.79580146, + "learning_rate": 6.336168956469867e-08, + "loss": 0.81753457, + "num_input_tokens_seen": 165795435, + "step": 7668, + "time_per_iteration": 2.414461851119995 + }, + { + "auxiliary_loss_clip": 0.01129756, + "auxiliary_loss_mlp": 0.0102533, + "balance_loss_clip": 1.04336774, + "balance_loss_mlp": 1.01898789, + "epoch": 0.9221427283111886, + "flos": 24790464875520.0, + "grad_norm": 1.7744680438508844, + "language_loss": 0.72014236, + "learning_rate": 6.316731488351168e-08, + "loss": 0.74169326, + "num_input_tokens_seen": 165816625, + "step": 7669, + "time_per_iteration": 3.2863781452178955 + }, + { + "auxiliary_loss_clip": 0.01150566, + "auxiliary_loss_mlp": 0.01022543, + "balance_loss_clip": 1.04579663, + "balance_loss_mlp": 1.01539671, + "epoch": 0.9222629712018277, + "flos": 13845036625920.0, + "grad_norm": 1.793424297577179, + "language_loss": 0.635225, + "learning_rate": 6.297323401830334e-08, + "loss": 0.65695614, + "num_input_tokens_seen": 165835410, + "step": 7670, + "time_per_iteration": 3.237065315246582 + }, + { + "auxiliary_loss_clip": 0.01153341, + "auxiliary_loss_mlp": 0.0102241, + "balance_loss_clip": 1.04544127, + "balance_loss_mlp": 1.01544249, + "epoch": 0.9223832140924668, + "flos": 21616177196160.0, + "grad_norm": 2.8107892334190265, + "language_loss": 0.69219255, + "learning_rate": 6.277944699851523e-08, + "loss": 0.71395004, + "num_input_tokens_seen": 165854930, + "step": 7671, + "time_per_iteration": 2.4299607276916504 + }, + { + "auxiliary_loss_clip": 0.01162791, + "auxiliary_loss_mlp": 0.01025976, + "balance_loss_clip": 1.04550004, + "balance_loss_mlp": 1.01891541, + "epoch": 0.9225034569831059, + "flos": 21142084561920.0, + "grad_norm": 2.3558633985658775, + "language_loss": 0.73348355, + "learning_rate": 6.25859538535447e-08, + "loss": 0.75537121, + "num_input_tokens_seen": 165875725, + "step": 7672, + "time_per_iteration": 2.3961212635040283 + }, + { + "auxiliary_loss_clip": 0.01136135, + "auxiliary_loss_mlp": 0.01022535, + "balance_loss_clip": 1.04473019, + "balance_loss_mlp": 1.01548386, + "epoch": 0.9226236998737449, + "flos": 12495046844160.0, + "grad_norm": 2.4901182051713526, + "language_loss": 0.78354412, + "learning_rate": 6.239275461274474e-08, + "loss": 0.80513084, + "num_input_tokens_seen": 165892100, + "step": 7673, + "time_per_iteration": 2.426987886428833 + }, + { + "auxiliary_loss_clip": 0.01152056, + "auxiliary_loss_mlp": 0.01027172, + "balance_loss_clip": 1.04599643, + "balance_loss_mlp": 1.02035928, + "epoch": 0.9227439427643841, + "flos": 26214071581440.0, + "grad_norm": 1.9298177639148921, + "language_loss": 0.85972744, + "learning_rate": 6.219984930542299e-08, + "loss": 0.88151968, + "num_input_tokens_seen": 165912840, + "step": 7674, + "time_per_iteration": 2.4786314964294434 + }, + { + "auxiliary_loss_clip": 0.01152311, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.04498756, + "balance_loss_mlp": 1.02031684, + "epoch": 0.9228641856550232, + "flos": 17967581400960.0, + "grad_norm": 2.4581083088160924, + "language_loss": 0.75962865, + "learning_rate": 6.200723796084383e-08, + "loss": 0.78142297, + "num_input_tokens_seen": 165930935, + "step": 7675, + "time_per_iteration": 2.3962435722351074 + }, + { + "auxiliary_loss_clip": 0.01035324, + "auxiliary_loss_mlp": 0.01001526, + "balance_loss_clip": 1.00832105, + "balance_loss_mlp": 1.00054836, + "epoch": 0.9229844285456622, + "flos": 70420609710720.0, + "grad_norm": 0.7649790507259957, + "language_loss": 0.63101447, + "learning_rate": 6.181492060822546e-08, + "loss": 0.65138292, + "num_input_tokens_seen": 165991110, + "step": 7676, + "time_per_iteration": 2.991464138031006 + }, + { + "auxiliary_loss_clip": 0.01107256, + "auxiliary_loss_mlp": 0.0102243, + "balance_loss_clip": 1.04085457, + "balance_loss_mlp": 1.01518166, + "epoch": 0.9231046714363014, + "flos": 17967832796160.0, + "grad_norm": 2.1056106856434056, + "language_loss": 0.81498748, + "learning_rate": 6.162289727674274e-08, + "loss": 0.8362844, + "num_input_tokens_seen": 166008790, + "step": 7677, + "time_per_iteration": 3.2753589153289795 + }, + { + "auxiliary_loss_clip": 0.01122257, + "auxiliary_loss_mlp": 0.0102265, + "balance_loss_clip": 1.04182434, + "balance_loss_mlp": 1.01609945, + "epoch": 0.9232249143269404, + "flos": 17858233422720.0, + "grad_norm": 2.154044540981909, + "language_loss": 0.87877572, + "learning_rate": 6.143116799552527e-08, + "loss": 0.9002248, + "num_input_tokens_seen": 166025035, + "step": 7678, + "time_per_iteration": 2.524458885192871 + }, + { + "auxiliary_loss_clip": 0.01155175, + "auxiliary_loss_mlp": 0.01022785, + "balance_loss_clip": 1.04737461, + "balance_loss_mlp": 1.01561451, + "epoch": 0.9233451572175795, + "flos": 23404384903680.0, + "grad_norm": 2.3157741603411695, + "language_loss": 0.55908358, + "learning_rate": 6.123973279365802e-08, + "loss": 0.58086324, + "num_input_tokens_seen": 166044010, + "step": 7679, + "time_per_iteration": 2.468770742416382 + }, + { + "auxiliary_loss_clip": 0.01155081, + "auxiliary_loss_mlp": 0.01023463, + "balance_loss_clip": 1.04675853, + "balance_loss_mlp": 1.01700437, + "epoch": 0.9234654001082186, + "flos": 17999326045440.0, + "grad_norm": 2.0193275203489454, + "language_loss": 0.7778011, + "learning_rate": 6.10485917001824e-08, + "loss": 0.79958653, + "num_input_tokens_seen": 166061865, + "step": 7680, + "time_per_iteration": 2.422494649887085 + }, + { + "auxiliary_loss_clip": 0.0114014, + "auxiliary_loss_mlp": 0.01021268, + "balance_loss_clip": 1.04387641, + "balance_loss_mlp": 1.01465154, + "epoch": 0.9235856429988577, + "flos": 24750747411840.0, + "grad_norm": 1.6685948186289121, + "language_loss": 0.81034184, + "learning_rate": 6.085774474409322e-08, + "loss": 0.83195591, + "num_input_tokens_seen": 166082425, + "step": 7681, + "time_per_iteration": 2.503793716430664 + }, + { + "auxiliary_loss_clip": 0.01138869, + "auxiliary_loss_mlp": 0.01028445, + "balance_loss_clip": 1.04790807, + "balance_loss_mlp": 1.02164137, + "epoch": 0.9237058858894968, + "flos": 14099894599680.0, + "grad_norm": 2.1480405752097687, + "language_loss": 0.70128691, + "learning_rate": 6.066719195434267e-08, + "loss": 0.72296011, + "num_input_tokens_seen": 166100225, + "step": 7682, + "time_per_iteration": 2.4714064598083496 + }, + { + "auxiliary_loss_clip": 0.01153123, + "auxiliary_loss_mlp": 0.01025575, + "balance_loss_clip": 1.04660714, + "balance_loss_mlp": 1.01844883, + "epoch": 0.9238261287801359, + "flos": 28694529175680.0, + "grad_norm": 2.3167486633050283, + "language_loss": 0.66617912, + "learning_rate": 6.047693335983717e-08, + "loss": 0.68796605, + "num_input_tokens_seen": 166122570, + "step": 7683, + "time_per_iteration": 2.499300241470337 + }, + { + "auxiliary_loss_clip": 0.01153306, + "auxiliary_loss_mlp": 0.01023919, + "balance_loss_clip": 1.04448271, + "balance_loss_mlp": 1.01668024, + "epoch": 0.923946371670775, + "flos": 23111856541440.0, + "grad_norm": 2.5498969938233595, + "language_loss": 0.82253832, + "learning_rate": 6.028696898943853e-08, + "loss": 0.84431058, + "num_input_tokens_seen": 166141630, + "step": 7684, + "time_per_iteration": 2.450551748275757 + }, + { + "auxiliary_loss_clip": 0.01136665, + "auxiliary_loss_mlp": 0.00762501, + "balance_loss_clip": 1.04158926, + "balance_loss_mlp": 1.00044954, + "epoch": 0.924066614561414, + "flos": 21867120587520.0, + "grad_norm": 1.9420646668544417, + "language_loss": 0.70732617, + "learning_rate": 6.00972988719648e-08, + "loss": 0.72631782, + "num_input_tokens_seen": 166159865, + "step": 7685, + "time_per_iteration": 2.4805076122283936 + }, + { + "auxiliary_loss_clip": 0.01126211, + "auxiliary_loss_mlp": 0.00762283, + "balance_loss_clip": 1.04354739, + "balance_loss_mlp": 1.00046277, + "epoch": 0.9241868574520532, + "flos": 28511887495680.0, + "grad_norm": 2.324000694529363, + "language_loss": 0.70573151, + "learning_rate": 5.990792303618807e-08, + "loss": 0.72461641, + "num_input_tokens_seen": 166179445, + "step": 7686, + "time_per_iteration": 2.560917615890503 + }, + { + "auxiliary_loss_clip": 0.01123514, + "auxiliary_loss_mlp": 0.01019981, + "balance_loss_clip": 1.04523516, + "balance_loss_mlp": 1.01284671, + "epoch": 0.9243071003426923, + "flos": 30518324282880.0, + "grad_norm": 1.5605261130800718, + "language_loss": 0.69285089, + "learning_rate": 5.971884151083695e-08, + "loss": 0.71428585, + "num_input_tokens_seen": 166201855, + "step": 7687, + "time_per_iteration": 2.5913567543029785 + }, + { + "auxiliary_loss_clip": 0.01138238, + "auxiliary_loss_mlp": 0.01023886, + "balance_loss_clip": 1.04310644, + "balance_loss_mlp": 1.01737392, + "epoch": 0.9244273432333313, + "flos": 28658331244800.0, + "grad_norm": 1.761702590115966, + "language_loss": 0.74576336, + "learning_rate": 5.9530054324595124e-08, + "loss": 0.76738453, + "num_input_tokens_seen": 166221970, + "step": 7688, + "time_per_iteration": 2.5443081855773926 + }, + { + "auxiliary_loss_clip": 0.01046539, + "auxiliary_loss_mlp": 0.0075287, + "balance_loss_clip": 1.00763559, + "balance_loss_mlp": 0.99982464, + "epoch": 0.9245475861239704, + "flos": 66230589237120.0, + "grad_norm": 0.7171431574459651, + "language_loss": 0.5757215, + "learning_rate": 5.934156150610103e-08, + "loss": 0.59371555, + "num_input_tokens_seen": 166279335, + "step": 7689, + "time_per_iteration": 3.082620143890381 + }, + { + "auxiliary_loss_clip": 0.01134944, + "auxiliary_loss_mlp": 0.01024706, + "balance_loss_clip": 1.04298222, + "balance_loss_mlp": 1.0172013, + "epoch": 0.9246678290146095, + "flos": 24239918142720.0, + "grad_norm": 3.624306164854581, + "language_loss": 0.78735125, + "learning_rate": 5.915336308394914e-08, + "loss": 0.80894774, + "num_input_tokens_seen": 166298170, + "step": 7690, + "time_per_iteration": 2.485715866088867 + }, + { + "auxiliary_loss_clip": 0.01145328, + "auxiliary_loss_mlp": 0.01022568, + "balance_loss_clip": 1.0442158, + "balance_loss_mlp": 1.01650369, + "epoch": 0.9247880719052486, + "flos": 18988808976000.0, + "grad_norm": 1.5879498549657893, + "language_loss": 0.76919603, + "learning_rate": 5.89654590866886e-08, + "loss": 0.79087508, + "num_input_tokens_seen": 166317670, + "step": 7691, + "time_per_iteration": 2.4459073543548584 + }, + { + "auxiliary_loss_clip": 0.01102379, + "auxiliary_loss_mlp": 0.0102443, + "balance_loss_clip": 1.04512763, + "balance_loss_mlp": 1.01674128, + "epoch": 0.9249083147958876, + "flos": 24024095274240.0, + "grad_norm": 1.9296176075650382, + "language_loss": 0.88285249, + "learning_rate": 5.877784954282483e-08, + "loss": 0.90412056, + "num_input_tokens_seen": 166337010, + "step": 7692, + "time_per_iteration": 2.593445301055908 + }, + { + "auxiliary_loss_clip": 0.01154752, + "auxiliary_loss_mlp": 0.01022227, + "balance_loss_clip": 1.04630804, + "balance_loss_mlp": 1.01444268, + "epoch": 0.9250285576865268, + "flos": 30773972355840.0, + "grad_norm": 1.9143729784944223, + "language_loss": 0.72114557, + "learning_rate": 5.8590534480817963e-08, + "loss": 0.74291539, + "num_input_tokens_seen": 166358735, + "step": 7693, + "time_per_iteration": 3.345564126968384 + }, + { + "auxiliary_loss_clip": 0.01166166, + "auxiliary_loss_mlp": 0.01026565, + "balance_loss_clip": 1.04837728, + "balance_loss_mlp": 1.01949334, + "epoch": 0.9251488005771659, + "flos": 10633581348480.0, + "grad_norm": 2.2040602087679444, + "language_loss": 0.72239274, + "learning_rate": 5.840351392908349e-08, + "loss": 0.74432003, + "num_input_tokens_seen": 166374455, + "step": 7694, + "time_per_iteration": 2.3825693130493164 + }, + { + "auxiliary_loss_clip": 0.01144394, + "auxiliary_loss_mlp": 0.00762162, + "balance_loss_clip": 1.04419816, + "balance_loss_mlp": 1.00046086, + "epoch": 0.9252690434678049, + "flos": 23586416052480.0, + "grad_norm": 2.525139122086702, + "language_loss": 0.70718002, + "learning_rate": 5.821678791599205e-08, + "loss": 0.72624552, + "num_input_tokens_seen": 166393900, + "step": 7695, + "time_per_iteration": 3.231208086013794 + }, + { + "auxiliary_loss_clip": 0.0113516, + "auxiliary_loss_mlp": 0.01022443, + "balance_loss_clip": 1.04476511, + "balance_loss_mlp": 1.01569295, + "epoch": 0.9253892863584441, + "flos": 21469158829440.0, + "grad_norm": 2.467006232156446, + "language_loss": 0.80672932, + "learning_rate": 5.803035646986965e-08, + "loss": 0.82830536, + "num_input_tokens_seen": 166413235, + "step": 7696, + "time_per_iteration": 3.293334484100342 + }, + { + "auxiliary_loss_clip": 0.01167685, + "auxiliary_loss_mlp": 0.0102406, + "balance_loss_clip": 1.04764295, + "balance_loss_mlp": 1.01640689, + "epoch": 0.9255095292490831, + "flos": 17456680304640.0, + "grad_norm": 2.690685647825443, + "language_loss": 0.67256463, + "learning_rate": 5.7844219618998766e-08, + "loss": 0.69448209, + "num_input_tokens_seen": 166427560, + "step": 7697, + "time_per_iteration": 2.397186756134033 + }, + { + "auxiliary_loss_clip": 0.01107582, + "auxiliary_loss_mlp": 0.01022265, + "balance_loss_clip": 1.03805733, + "balance_loss_mlp": 1.01510406, + "epoch": 0.9256297721397222, + "flos": 24750675584640.0, + "grad_norm": 1.7160769054601068, + "language_loss": 0.71809733, + "learning_rate": 5.765837739161505e-08, + "loss": 0.7393958, + "num_input_tokens_seen": 166446680, + "step": 7698, + "time_per_iteration": 2.542025566101074 + }, + { + "auxiliary_loss_clip": 0.01123132, + "auxiliary_loss_mlp": 0.01021095, + "balance_loss_clip": 1.04255581, + "balance_loss_mlp": 1.01413357, + "epoch": 0.9257500150303614, + "flos": 23112215677440.0, + "grad_norm": 2.253933801140475, + "language_loss": 0.74176347, + "learning_rate": 5.7472829815911504e-08, + "loss": 0.76320571, + "num_input_tokens_seen": 166465505, + "step": 7699, + "time_per_iteration": 2.5108723640441895 + }, + { + "auxiliary_loss_clip": 0.01133904, + "auxiliary_loss_mlp": 0.01025839, + "balance_loss_clip": 1.04401207, + "balance_loss_mlp": 1.01847148, + "epoch": 0.9258702579210004, + "flos": 22564685687040.0, + "grad_norm": 1.7266540174413394, + "language_loss": 0.81365013, + "learning_rate": 5.7287576920035164e-08, + "loss": 0.83524758, + "num_input_tokens_seen": 166484520, + "step": 7700, + "time_per_iteration": 2.488548517227173 + }, + { + "auxiliary_loss_clip": 0.01120688, + "auxiliary_loss_mlp": 0.01021268, + "balance_loss_clip": 1.04389286, + "balance_loss_mlp": 1.01470518, + "epoch": 0.9259905008116395, + "flos": 30004298703360.0, + "grad_norm": 2.234269878052166, + "language_loss": 0.76880634, + "learning_rate": 5.7102618732088435e-08, + "loss": 0.79022586, + "num_input_tokens_seen": 166503850, + "step": 7701, + "time_per_iteration": 2.5677061080932617 + }, + { + "auxiliary_loss_clip": 0.01142892, + "auxiliary_loss_mlp": 0.01027352, + "balance_loss_clip": 1.04523194, + "balance_loss_mlp": 1.02100372, + "epoch": 0.9261107437022786, + "flos": 24572128055040.0, + "grad_norm": 1.5401736708876554, + "language_loss": 0.74428922, + "learning_rate": 5.6917955280130216e-08, + "loss": 0.76599169, + "num_input_tokens_seen": 166525330, + "step": 7702, + "time_per_iteration": 2.5289223194122314 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01023195, + "balance_loss_clip": 1.04583275, + "balance_loss_mlp": 1.01612616, + "epoch": 0.9262309865929177, + "flos": 22018448586240.0, + "grad_norm": 2.1804399549653306, + "language_loss": 0.72180104, + "learning_rate": 5.6733586592172755e-08, + "loss": 0.74353099, + "num_input_tokens_seen": 166544825, + "step": 7703, + "time_per_iteration": 2.436396360397339 + }, + { + "auxiliary_loss_clip": 0.01129411, + "auxiliary_loss_mlp": 0.00761033, + "balance_loss_clip": 1.04053009, + "balance_loss_mlp": 1.00041819, + "epoch": 0.9263512294835567, + "flos": 20339481116160.0, + "grad_norm": 2.078291786574832, + "language_loss": 0.80020046, + "learning_rate": 5.6549512696185244e-08, + "loss": 0.81910491, + "num_input_tokens_seen": 166563325, + "step": 7704, + "time_per_iteration": 3.258502960205078 + }, + { + "auxiliary_loss_clip": 0.01162358, + "auxiliary_loss_mlp": 0.01021805, + "balance_loss_clip": 1.04607201, + "balance_loss_mlp": 1.01470888, + "epoch": 0.9264714723741959, + "flos": 21215378263680.0, + "grad_norm": 1.6007703747131554, + "language_loss": 0.68425059, + "learning_rate": 5.636573362009156e-08, + "loss": 0.70609224, + "num_input_tokens_seen": 166583385, + "step": 7705, + "time_per_iteration": 2.415381669998169 + }, + { + "auxiliary_loss_clip": 0.01167858, + "auxiliary_loss_mlp": 0.01026402, + "balance_loss_clip": 1.04826832, + "balance_loss_mlp": 1.01919281, + "epoch": 0.926591715264835, + "flos": 18004964480640.0, + "grad_norm": 3.776885144725662, + "language_loss": 0.77157074, + "learning_rate": 5.618224939177074e-08, + "loss": 0.79351336, + "num_input_tokens_seen": 166601290, + "step": 7706, + "time_per_iteration": 2.475008249282837 + }, + { + "auxiliary_loss_clip": 0.01125939, + "auxiliary_loss_mlp": 0.0102428, + "balance_loss_clip": 1.04210711, + "balance_loss_mlp": 1.01675177, + "epoch": 0.926711958155474, + "flos": 36167969825280.0, + "grad_norm": 1.7146859937973897, + "language_loss": 0.70432246, + "learning_rate": 5.599906003905719e-08, + "loss": 0.72582465, + "num_input_tokens_seen": 166623835, + "step": 7707, + "time_per_iteration": 2.598785877227783 + }, + { + "auxiliary_loss_clip": 0.01149999, + "auxiliary_loss_mlp": 0.01024928, + "balance_loss_clip": 1.04927945, + "balance_loss_mlp": 1.01750767, + "epoch": 0.9268322010461132, + "flos": 21032736583680.0, + "grad_norm": 2.7246534353093868, + "language_loss": 0.82195169, + "learning_rate": 5.581616558974023e-08, + "loss": 0.843701, + "num_input_tokens_seen": 166642400, + "step": 7708, + "time_per_iteration": 2.4794108867645264 + }, + { + "auxiliary_loss_clip": 0.01157854, + "auxiliary_loss_mlp": 0.00762039, + "balance_loss_clip": 1.0469873, + "balance_loss_mlp": 1.00044227, + "epoch": 0.9269524439367522, + "flos": 22964838174720.0, + "grad_norm": 2.5884560524548634, + "language_loss": 0.79461956, + "learning_rate": 5.5633566071565444e-08, + "loss": 0.81381845, + "num_input_tokens_seen": 166661640, + "step": 7709, + "time_per_iteration": 2.44502854347229 + }, + { + "auxiliary_loss_clip": 0.01098035, + "auxiliary_loss_mlp": 0.01021495, + "balance_loss_clip": 1.04028416, + "balance_loss_mlp": 1.01492405, + "epoch": 0.9270726868273913, + "flos": 41975551468800.0, + "grad_norm": 2.2718387141297023, + "language_loss": 0.70256007, + "learning_rate": 5.5451261512232896e-08, + "loss": 0.72375536, + "num_input_tokens_seen": 166684320, + "step": 7710, + "time_per_iteration": 2.740999937057495 + }, + { + "auxiliary_loss_clip": 0.01155226, + "auxiliary_loss_mlp": 0.01024191, + "balance_loss_clip": 1.0435636, + "balance_loss_mlp": 1.01687133, + "epoch": 0.9271929297180305, + "flos": 19791771557760.0, + "grad_norm": 2.1713689590747838, + "language_loss": 0.62382555, + "learning_rate": 5.5269251939397576e-08, + "loss": 0.64561969, + "num_input_tokens_seen": 166703835, + "step": 7711, + "time_per_iteration": 2.4188764095306396 + }, + { + "auxiliary_loss_clip": 0.01121206, + "auxiliary_loss_mlp": 0.01022366, + "balance_loss_clip": 1.03927302, + "balance_loss_mlp": 1.01551783, + "epoch": 0.9273131726086695, + "flos": 19968343839360.0, + "grad_norm": 1.9894028545136337, + "language_loss": 0.76486635, + "learning_rate": 5.508753738067073e-08, + "loss": 0.78630203, + "num_input_tokens_seen": 166723375, + "step": 7712, + "time_per_iteration": 2.510390043258667 + }, + { + "auxiliary_loss_clip": 0.01152427, + "auxiliary_loss_mlp": 0.01023906, + "balance_loss_clip": 1.0429548, + "balance_loss_mlp": 1.01665831, + "epoch": 0.9274334154993086, + "flos": 23258587599360.0, + "grad_norm": 2.1511972505550183, + "language_loss": 0.79094118, + "learning_rate": 5.4906117863617875e-08, + "loss": 0.81270456, + "num_input_tokens_seen": 166742760, + "step": 7713, + "time_per_iteration": 2.449094533920288 + }, + { + "auxiliary_loss_clip": 0.01119578, + "auxiliary_loss_mlp": 0.01019551, + "balance_loss_clip": 1.04015279, + "balance_loss_mlp": 1.01269639, + "epoch": 0.9275536583899477, + "flos": 31795343585280.0, + "grad_norm": 1.9971500394771942, + "language_loss": 0.77926266, + "learning_rate": 5.4724993415760533e-08, + "loss": 0.80065393, + "num_input_tokens_seen": 166761115, + "step": 7714, + "time_per_iteration": 2.574300527572632 + }, + { + "auxiliary_loss_clip": 0.01131738, + "auxiliary_loss_mlp": 0.0076222, + "balance_loss_clip": 1.04331052, + "balance_loss_mlp": 1.00049901, + "epoch": 0.9276739012805868, + "flos": 18696998885760.0, + "grad_norm": 2.2041691207075473, + "language_loss": 0.74647439, + "learning_rate": 5.454416406457496e-08, + "loss": 0.765414, + "num_input_tokens_seen": 166780210, + "step": 7715, + "time_per_iteration": 2.484652519226074 + }, + { + "auxiliary_loss_clip": 0.01150329, + "auxiliary_loss_mlp": 0.01024749, + "balance_loss_clip": 1.04465127, + "balance_loss_mlp": 1.01838326, + "epoch": 0.9277941441712259, + "flos": 13879079740800.0, + "grad_norm": 3.8223549304381845, + "language_loss": 0.74234211, + "learning_rate": 5.436362983749299e-08, + "loss": 0.76409286, + "num_input_tokens_seen": 166795380, + "step": 7716, + "time_per_iteration": 2.382547378540039 + }, + { + "auxiliary_loss_clip": 0.0111638, + "auxiliary_loss_mlp": 0.01025423, + "balance_loss_clip": 1.04309928, + "balance_loss_mlp": 1.01898551, + "epoch": 0.927914387061865, + "flos": 23258659426560.0, + "grad_norm": 1.9003039239133317, + "language_loss": 0.64127475, + "learning_rate": 5.418339076190137e-08, + "loss": 0.66269279, + "num_input_tokens_seen": 166814890, + "step": 7717, + "time_per_iteration": 2.507246732711792 + }, + { + "auxiliary_loss_clip": 0.01130186, + "auxiliary_loss_mlp": 0.01019055, + "balance_loss_clip": 1.04356813, + "balance_loss_mlp": 1.01203942, + "epoch": 0.9280346299525041, + "flos": 18073733068800.0, + "grad_norm": 1.824236892508934, + "language_loss": 0.88926315, + "learning_rate": 5.400344686514202e-08, + "loss": 0.91075552, + "num_input_tokens_seen": 166832475, + "step": 7718, + "time_per_iteration": 2.4689576625823975 + }, + { + "auxiliary_loss_clip": 0.01151405, + "auxiliary_loss_mlp": 0.01020594, + "balance_loss_clip": 1.0477742, + "balance_loss_mlp": 1.01370335, + "epoch": 0.9281548728431431, + "flos": 22342901160960.0, + "grad_norm": 1.928529545349362, + "language_loss": 0.66944981, + "learning_rate": 5.38237981745131e-08, + "loss": 0.69116986, + "num_input_tokens_seen": 166850590, + "step": 7719, + "time_per_iteration": 2.4414563179016113 + }, + { + "auxiliary_loss_clip": 0.01154081, + "auxiliary_loss_mlp": 0.00761852, + "balance_loss_clip": 1.04626405, + "balance_loss_mlp": 1.00044453, + "epoch": 0.9282751157337822, + "flos": 18843765857280.0, + "grad_norm": 1.7148912425862641, + "language_loss": 0.81268364, + "learning_rate": 5.364444471726592e-08, + "loss": 0.83184296, + "num_input_tokens_seen": 166869795, + "step": 7720, + "time_per_iteration": 3.2681591510772705 + }, + { + "auxiliary_loss_clip": 0.01150214, + "auxiliary_loss_mlp": 0.01022147, + "balance_loss_clip": 1.04457581, + "balance_loss_mlp": 1.01527441, + "epoch": 0.9283953586244214, + "flos": 25556834476800.0, + "grad_norm": 2.1036630140560133, + "language_loss": 0.80129731, + "learning_rate": 5.346538652060939e-08, + "loss": 0.82302094, + "num_input_tokens_seen": 166891150, + "step": 7721, + "time_per_iteration": 3.2041115760803223 + }, + { + "auxiliary_loss_clip": 0.0113554, + "auxiliary_loss_mlp": 0.01023081, + "balance_loss_clip": 1.04560232, + "balance_loss_mlp": 1.0163902, + "epoch": 0.9285156015150604, + "flos": 18223480869120.0, + "grad_norm": 1.8075976976897448, + "language_loss": 0.70257974, + "learning_rate": 5.3286623611705994e-08, + "loss": 0.72416592, + "num_input_tokens_seen": 166909195, + "step": 7722, + "time_per_iteration": 2.441241979598999 + }, + { + "auxiliary_loss_clip": 0.0106071, + "auxiliary_loss_mlp": 0.01000679, + "balance_loss_clip": 1.00722241, + "balance_loss_mlp": 0.99983847, + "epoch": 0.9286358444056995, + "flos": 66400017690240.0, + "grad_norm": 0.8209562883633076, + "language_loss": 0.60578537, + "learning_rate": 5.3108156017673824e-08, + "loss": 0.62639928, + "num_input_tokens_seen": 166970955, + "step": 7723, + "time_per_iteration": 3.8582022190093994 + }, + { + "auxiliary_loss_clip": 0.01143751, + "auxiliary_loss_mlp": 0.01024558, + "balance_loss_clip": 1.04489994, + "balance_loss_mlp": 1.01667273, + "epoch": 0.9287560872963386, + "flos": 22345630594560.0, + "grad_norm": 1.7369532673867514, + "language_loss": 0.71586514, + "learning_rate": 5.2929983765586775e-08, + "loss": 0.73754817, + "num_input_tokens_seen": 166989735, + "step": 7724, + "time_per_iteration": 2.4824748039245605 + }, + { + "auxiliary_loss_clip": 0.01168027, + "auxiliary_loss_mlp": 0.01025992, + "balance_loss_clip": 1.05047107, + "balance_loss_mlp": 1.01937258, + "epoch": 0.9288763301869777, + "flos": 25700225569920.0, + "grad_norm": 1.9292333302726883, + "language_loss": 0.62602824, + "learning_rate": 5.275210688247278e-08, + "loss": 0.64796841, + "num_input_tokens_seen": 167010060, + "step": 7725, + "time_per_iteration": 2.4439094066619873 + }, + { + "auxiliary_loss_clip": 0.01110409, + "auxiliary_loss_mlp": 0.01024357, + "balance_loss_clip": 1.04322231, + "balance_loss_mlp": 1.01732993, + "epoch": 0.9289965730776167, + "flos": 12312046028160.0, + "grad_norm": 2.5179880367599026, + "language_loss": 0.84967697, + "learning_rate": 5.257452539531604e-08, + "loss": 0.87102473, + "num_input_tokens_seen": 167027130, + "step": 7726, + "time_per_iteration": 2.5316436290740967 + }, + { + "auxiliary_loss_clip": 0.01151051, + "auxiliary_loss_mlp": 0.01025356, + "balance_loss_clip": 1.04416776, + "balance_loss_mlp": 1.01818252, + "epoch": 0.9291168159682559, + "flos": 26685973486080.0, + "grad_norm": 1.712772720054803, + "language_loss": 0.68287933, + "learning_rate": 5.2397239331055445e-08, + "loss": 0.70464337, + "num_input_tokens_seen": 167049130, + "step": 7727, + "time_per_iteration": 2.495070219039917 + }, + { + "auxiliary_loss_clip": 0.01133389, + "auxiliary_loss_mlp": 0.01022613, + "balance_loss_clip": 1.04475236, + "balance_loss_mlp": 1.01545489, + "epoch": 0.929237058858895, + "flos": 14538256179840.0, + "grad_norm": 1.9723114897677214, + "language_loss": 0.81500828, + "learning_rate": 5.2220248716585036e-08, + "loss": 0.8365683, + "num_input_tokens_seen": 167066810, + "step": 7728, + "time_per_iteration": 2.4856009483337402 + }, + { + "auxiliary_loss_clip": 0.01142898, + "auxiliary_loss_mlp": 0.01026097, + "balance_loss_clip": 1.04326773, + "balance_loss_mlp": 1.01881921, + "epoch": 0.929357301749534, + "flos": 23835456023040.0, + "grad_norm": 2.0918458255568253, + "language_loss": 0.75484776, + "learning_rate": 5.204355357875445e-08, + "loss": 0.77653766, + "num_input_tokens_seen": 167085155, + "step": 7729, + "time_per_iteration": 2.4698424339294434 + }, + { + "auxiliary_loss_clip": 0.01134139, + "auxiliary_loss_mlp": 0.01021903, + "balance_loss_clip": 1.0421741, + "balance_loss_mlp": 1.01469111, + "epoch": 0.9294775446401732, + "flos": 12969319046400.0, + "grad_norm": 3.819865613437238, + "language_loss": 0.70565271, + "learning_rate": 5.1867153944367584e-08, + "loss": 0.72721314, + "num_input_tokens_seen": 167101545, + "step": 7730, + "time_per_iteration": 3.2055585384368896 + }, + { + "auxiliary_loss_clip": 0.01129575, + "auxiliary_loss_mlp": 0.0102842, + "balance_loss_clip": 1.04517806, + "balance_loss_mlp": 1.02137816, + "epoch": 0.9295977875308122, + "flos": 26211809024640.0, + "grad_norm": 1.9733551291917715, + "language_loss": 0.73428869, + "learning_rate": 5.16910498401848e-08, + "loss": 0.75586867, + "num_input_tokens_seen": 167120995, + "step": 7731, + "time_per_iteration": 2.55090594291687 + }, + { + "auxiliary_loss_clip": 0.01164265, + "auxiliary_loss_mlp": 0.01022809, + "balance_loss_clip": 1.04812241, + "balance_loss_mlp": 1.01623213, + "epoch": 0.9297180304214513, + "flos": 16472297105280.0, + "grad_norm": 2.761499483817806, + "language_loss": 0.83834797, + "learning_rate": 5.151524129292073e-08, + "loss": 0.86021876, + "num_input_tokens_seen": 167138890, + "step": 7732, + "time_per_iteration": 2.360590934753418 + }, + { + "auxiliary_loss_clip": 0.01148188, + "auxiliary_loss_mlp": 0.01025616, + "balance_loss_clip": 1.04436302, + "balance_loss_mlp": 1.01897931, + "epoch": 0.9298382733120905, + "flos": 24060436859520.0, + "grad_norm": 1.8887326872437653, + "language_loss": 0.66394758, + "learning_rate": 5.1339728329245155e-08, + "loss": 0.68568558, + "num_input_tokens_seen": 167159455, + "step": 7733, + "time_per_iteration": 2.4520535469055176 + }, + { + "auxiliary_loss_clip": 0.01170393, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.04811764, + "balance_loss_mlp": 1.02027082, + "epoch": 0.9299585162027295, + "flos": 22127652910080.0, + "grad_norm": 8.35413144269687, + "language_loss": 0.78955173, + "learning_rate": 5.116451097578367e-08, + "loss": 0.81153399, + "num_input_tokens_seen": 167178495, + "step": 7734, + "time_per_iteration": 2.397625684738159 + }, + { + "auxiliary_loss_clip": 0.01121863, + "auxiliary_loss_mlp": 0.01024547, + "balance_loss_clip": 1.04243064, + "balance_loss_mlp": 1.01761496, + "epoch": 0.9300787590933686, + "flos": 21471780522240.0, + "grad_norm": 1.605495500273465, + "language_loss": 0.74439311, + "learning_rate": 5.0989589259115895e-08, + "loss": 0.76585722, + "num_input_tokens_seen": 167199380, + "step": 7735, + "time_per_iteration": 2.513848066329956 + }, + { + "auxiliary_loss_clip": 0.01148072, + "auxiliary_loss_mlp": 0.01026779, + "balance_loss_clip": 1.04247499, + "balance_loss_mlp": 1.01885128, + "epoch": 0.9301990019840077, + "flos": 17779588594560.0, + "grad_norm": 1.8065433676061218, + "language_loss": 0.71542984, + "learning_rate": 5.081496320577816e-08, + "loss": 0.73717839, + "num_input_tokens_seen": 167216500, + "step": 7736, + "time_per_iteration": 2.395390272140503 + }, + { + "auxiliary_loss_clip": 0.01045717, + "auxiliary_loss_mlp": 0.01002243, + "balance_loss_clip": 1.01703501, + "balance_loss_mlp": 1.00104511, + "epoch": 0.9303192448746468, + "flos": 58896122307840.0, + "grad_norm": 0.9145884418444209, + "language_loss": 0.61160982, + "learning_rate": 5.0640632842260835e-08, + "loss": 0.63208944, + "num_input_tokens_seen": 167276760, + "step": 7737, + "time_per_iteration": 3.1183815002441406 + }, + { + "auxiliary_loss_clip": 0.01120504, + "auxiliary_loss_mlp": 0.00761968, + "balance_loss_clip": 1.04490495, + "balance_loss_mlp": 1.00042558, + "epoch": 0.9304394877652858, + "flos": 57663522172800.0, + "grad_norm": 1.5205629772131883, + "language_loss": 0.72742152, + "learning_rate": 5.0466598195009426e-08, + "loss": 0.74624628, + "num_input_tokens_seen": 167303630, + "step": 7738, + "time_per_iteration": 2.83030366897583 + }, + { + "auxiliary_loss_clip": 0.01123498, + "auxiliary_loss_mlp": 0.01022881, + "balance_loss_clip": 1.04342103, + "balance_loss_mlp": 1.01574373, + "epoch": 0.930559730655925, + "flos": 20996143603200.0, + "grad_norm": 1.885378797414096, + "language_loss": 0.70368576, + "learning_rate": 5.0292859290425036e-08, + "loss": 0.72514963, + "num_input_tokens_seen": 167321500, + "step": 7739, + "time_per_iteration": 2.476635456085205 + }, + { + "auxiliary_loss_clip": 0.01164687, + "auxiliary_loss_mlp": 0.01022429, + "balance_loss_clip": 1.04824924, + "balance_loss_mlp": 1.01592886, + "epoch": 0.9306799735465641, + "flos": 23258264376960.0, + "grad_norm": 1.907589577368997, + "language_loss": 0.7770347, + "learning_rate": 5.011941615486348e-08, + "loss": 0.79890585, + "num_input_tokens_seen": 167340615, + "step": 7740, + "time_per_iteration": 2.418637990951538 + }, + { + "auxiliary_loss_clip": 0.0116328, + "auxiliary_loss_mlp": 0.01020981, + "balance_loss_clip": 1.0453999, + "balance_loss_mlp": 1.01398671, + "epoch": 0.9308002164372031, + "flos": 15231547560960.0, + "grad_norm": 2.223850965521068, + "language_loss": 0.84600842, + "learning_rate": 4.994626881463659e-08, + "loss": 0.86785102, + "num_input_tokens_seen": 167356870, + "step": 7741, + "time_per_iteration": 2.373375415802002 + }, + { + "auxiliary_loss_clip": 0.0109311, + "auxiliary_loss_mlp": 0.01023941, + "balance_loss_clip": 1.03836513, + "balance_loss_mlp": 1.01694906, + "epoch": 0.9309204593278423, + "flos": 30847481539200.0, + "grad_norm": 2.4769621717711185, + "language_loss": 0.71428066, + "learning_rate": 4.9773417296009814e-08, + "loss": 0.73545122, + "num_input_tokens_seen": 167378390, + "step": 7742, + "time_per_iteration": 2.5885632038116455 + }, + { + "auxiliary_loss_clip": 0.01156992, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.04703808, + "balance_loss_mlp": 1.02171302, + "epoch": 0.9310407022184813, + "flos": 23037269950080.0, + "grad_norm": 3.556905768030258, + "language_loss": 0.65484577, + "learning_rate": 4.960086162520527e-08, + "loss": 0.67670321, + "num_input_tokens_seen": 167398480, + "step": 7743, + "time_per_iteration": 2.469297409057617 + }, + { + "auxiliary_loss_clip": 0.01117885, + "auxiliary_loss_mlp": 0.01024157, + "balance_loss_clip": 1.04314637, + "balance_loss_mlp": 1.01715684, + "epoch": 0.9311609451091204, + "flos": 22127976132480.0, + "grad_norm": 2.0493586048885253, + "language_loss": 0.82337803, + "learning_rate": 4.942860182839936e-08, + "loss": 0.84479845, + "num_input_tokens_seen": 167416825, + "step": 7744, + "time_per_iteration": 2.5263147354125977 + }, + { + "auxiliary_loss_clip": 0.01135616, + "auxiliary_loss_mlp": 0.010244, + "balance_loss_clip": 1.04458964, + "balance_loss_mlp": 1.01740253, + "epoch": 0.9312811879997596, + "flos": 21099206701440.0, + "grad_norm": 1.8076055369687123, + "language_loss": 0.79525733, + "learning_rate": 4.925663793172341e-08, + "loss": 0.81685758, + "num_input_tokens_seen": 167434785, + "step": 7745, + "time_per_iteration": 2.477511167526245 + }, + { + "auxiliary_loss_clip": 0.01034751, + "auxiliary_loss_mlp": 0.00752947, + "balance_loss_clip": 1.00645757, + "balance_loss_mlp": 0.99990386, + "epoch": 0.9314014308903986, + "flos": 67148179096320.0, + "grad_norm": 0.7989216417284355, + "language_loss": 0.5653435, + "learning_rate": 4.908496996126477e-08, + "loss": 0.58322048, + "num_input_tokens_seen": 167498245, + "step": 7746, + "time_per_iteration": 3.1061158180236816 + }, + { + "auxiliary_loss_clip": 0.01153219, + "auxiliary_loss_mlp": 0.01028678, + "balance_loss_clip": 1.050313, + "balance_loss_mlp": 1.02121568, + "epoch": 0.9315216737810377, + "flos": 22565583527040.0, + "grad_norm": 1.4371260626113949, + "language_loss": 0.76294762, + "learning_rate": 4.89135979430646e-08, + "loss": 0.78476655, + "num_input_tokens_seen": 167518290, + "step": 7747, + "time_per_iteration": 3.307936191558838 + }, + { + "auxiliary_loss_clip": 0.01166726, + "auxiliary_loss_mlp": 0.01023054, + "balance_loss_clip": 1.04971361, + "balance_loss_mlp": 1.01529002, + "epoch": 0.9316419166716768, + "flos": 23984054588160.0, + "grad_norm": 1.7163494940621176, + "language_loss": 0.85623944, + "learning_rate": 4.874252190312078e-08, + "loss": 0.87813723, + "num_input_tokens_seen": 167538675, + "step": 7748, + "time_per_iteration": 3.2668607234954834 + }, + { + "auxiliary_loss_clip": 0.01154245, + "auxiliary_loss_mlp": 0.0102253, + "balance_loss_clip": 1.04525566, + "balance_loss_mlp": 1.01544046, + "epoch": 0.9317621595623159, + "flos": 30230464688640.0, + "grad_norm": 1.8446139642175259, + "language_loss": 0.64664161, + "learning_rate": 4.857174186738477e-08, + "loss": 0.66840935, + "num_input_tokens_seen": 167562025, + "step": 7749, + "time_per_iteration": 3.336543560028076 + }, + { + "auxiliary_loss_clip": 0.01167269, + "auxiliary_loss_mlp": 0.01024198, + "balance_loss_clip": 1.04961932, + "balance_loss_mlp": 1.0169208, + "epoch": 0.931882402452955, + "flos": 15742735966080.0, + "grad_norm": 2.2914542483161298, + "language_loss": 0.73243737, + "learning_rate": 4.840125786176408e-08, + "loss": 0.75435209, + "num_input_tokens_seen": 167578230, + "step": 7750, + "time_per_iteration": 2.376410484313965 + }, + { + "auxiliary_loss_clip": 0.01134074, + "auxiliary_loss_mlp": 0.01024268, + "balance_loss_clip": 1.04345489, + "balance_loss_mlp": 1.01722527, + "epoch": 0.932002645343594, + "flos": 28366521154560.0, + "grad_norm": 1.8739850468914647, + "language_loss": 0.77088964, + "learning_rate": 4.823106991212067e-08, + "loss": 0.79247302, + "num_input_tokens_seen": 167597470, + "step": 7751, + "time_per_iteration": 2.5340423583984375 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.01023895, + "balance_loss_clip": 1.04476213, + "balance_loss_mlp": 1.01700163, + "epoch": 0.9321228882342332, + "flos": 15341146934400.0, + "grad_norm": 2.216654090081813, + "language_loss": 0.83282322, + "learning_rate": 4.806117804427212e-08, + "loss": 0.85457915, + "num_input_tokens_seen": 167615405, + "step": 7752, + "time_per_iteration": 2.4378795623779297 + }, + { + "auxiliary_loss_clip": 0.01144995, + "auxiliary_loss_mlp": 0.01027329, + "balance_loss_clip": 1.0427556, + "balance_loss_mlp": 1.01986051, + "epoch": 0.9322431311248722, + "flos": 17895365107200.0, + "grad_norm": 3.744604978197083, + "language_loss": 0.63993376, + "learning_rate": 4.7891582283990926e-08, + "loss": 0.66165698, + "num_input_tokens_seen": 167634130, + "step": 7753, + "time_per_iteration": 2.406168222427368 + }, + { + "auxiliary_loss_clip": 0.01122858, + "auxiliary_loss_mlp": 0.01019465, + "balance_loss_clip": 1.0418663, + "balance_loss_mlp": 1.01253879, + "epoch": 0.9323633740155113, + "flos": 24169713010560.0, + "grad_norm": 1.5018631277508483, + "language_loss": 0.72707492, + "learning_rate": 4.772228265700473e-08, + "loss": 0.74849814, + "num_input_tokens_seen": 167654990, + "step": 7754, + "time_per_iteration": 2.5253841876983643 + }, + { + "auxiliary_loss_clip": 0.01155114, + "auxiliary_loss_mlp": 0.01025631, + "balance_loss_clip": 1.0465281, + "balance_loss_mlp": 1.0185976, + "epoch": 0.9324836169061504, + "flos": 15043482927360.0, + "grad_norm": 2.2490912748123106, + "language_loss": 0.7558254, + "learning_rate": 4.75532791889961e-08, + "loss": 0.77763277, + "num_input_tokens_seen": 167671690, + "step": 7755, + "time_per_iteration": 2.393376350402832 + }, + { + "auxiliary_loss_clip": 0.01148558, + "auxiliary_loss_mlp": 0.01024901, + "balance_loss_clip": 1.04360056, + "balance_loss_mlp": 1.01761401, + "epoch": 0.9326038597967895, + "flos": 18624890332800.0, + "grad_norm": 1.9383887156947166, + "language_loss": 0.65799683, + "learning_rate": 4.738457190560252e-08, + "loss": 0.67973143, + "num_input_tokens_seen": 167690800, + "step": 7756, + "time_per_iteration": 3.1505818367004395 + }, + { + "auxiliary_loss_clip": 0.01110736, + "auxiliary_loss_mlp": 0.01025574, + "balance_loss_clip": 1.04478145, + "balance_loss_mlp": 1.0183084, + "epoch": 0.9327241026874286, + "flos": 18952646958720.0, + "grad_norm": 2.0157851272316516, + "language_loss": 0.78822607, + "learning_rate": 4.721616083241664e-08, + "loss": 0.80958921, + "num_input_tokens_seen": 167709055, + "step": 7757, + "time_per_iteration": 2.5190184116363525 + }, + { + "auxiliary_loss_clip": 0.01147471, + "auxiliary_loss_mlp": 0.01023643, + "balance_loss_clip": 1.04522109, + "balance_loss_mlp": 1.016747, + "epoch": 0.9328443455780677, + "flos": 29570282668800.0, + "grad_norm": 1.6793944576780386, + "language_loss": 0.77681106, + "learning_rate": 4.7048045994986684e-08, + "loss": 0.79852223, + "num_input_tokens_seen": 167729915, + "step": 7758, + "time_per_iteration": 2.4970245361328125 + }, + { + "auxiliary_loss_clip": 0.01157731, + "auxiliary_loss_mlp": 0.01023386, + "balance_loss_clip": 1.04786968, + "balance_loss_mlp": 1.01630521, + "epoch": 0.9329645884687068, + "flos": 30081722469120.0, + "grad_norm": 2.9284981467544413, + "language_loss": 0.90691423, + "learning_rate": 4.688022741881559e-08, + "loss": 0.92872536, + "num_input_tokens_seen": 167750440, + "step": 7759, + "time_per_iteration": 2.4959499835968018 + }, + { + "auxiliary_loss_clip": 0.01147888, + "auxiliary_loss_mlp": 0.01022146, + "balance_loss_clip": 1.04418719, + "balance_loss_mlp": 1.01595926, + "epoch": 0.9330848313593458, + "flos": 21867982513920.0, + "grad_norm": 1.5600949313376908, + "language_loss": 0.75029552, + "learning_rate": 4.671270512936076e-08, + "loss": 0.77199578, + "num_input_tokens_seen": 167769600, + "step": 7760, + "time_per_iteration": 2.4216272830963135 + }, + { + "auxiliary_loss_clip": 0.01115762, + "auxiliary_loss_mlp": 0.01021622, + "balance_loss_clip": 1.04108334, + "balance_loss_mlp": 1.01483846, + "epoch": 0.933205074249985, + "flos": 22127221946880.0, + "grad_norm": 1.6774029024005355, + "language_loss": 0.82507986, + "learning_rate": 4.6545479152035884e-08, + "loss": 0.84645379, + "num_input_tokens_seen": 167788770, + "step": 7761, + "time_per_iteration": 2.50838565826416 + }, + { + "auxiliary_loss_clip": 0.01150832, + "auxiliary_loss_mlp": 0.01020741, + "balance_loss_clip": 1.0461936, + "balance_loss_mlp": 1.01398754, + "epoch": 0.9333253171406241, + "flos": 15341254675200.0, + "grad_norm": 1.902626518266492, + "language_loss": 0.7608161, + "learning_rate": 4.637854951220821e-08, + "loss": 0.78253186, + "num_input_tokens_seen": 167805555, + "step": 7762, + "time_per_iteration": 2.3953194618225098 + }, + { + "auxiliary_loss_clip": 0.01118119, + "auxiliary_loss_mlp": 0.01021795, + "balance_loss_clip": 1.04127622, + "balance_loss_mlp": 1.01502419, + "epoch": 0.9334455600312631, + "flos": 15706142985600.0, + "grad_norm": 2.2045194435547866, + "language_loss": 0.74879515, + "learning_rate": 4.621191623520171e-08, + "loss": 0.77019429, + "num_input_tokens_seen": 167823985, + "step": 7763, + "time_per_iteration": 2.4767022132873535 + }, + { + "auxiliary_loss_clip": 0.01109057, + "auxiliary_loss_mlp": 0.01025436, + "balance_loss_clip": 1.04244483, + "balance_loss_mlp": 1.01818228, + "epoch": 0.9335658029219023, + "flos": 22163563532160.0, + "grad_norm": 2.4869808445278814, + "language_loss": 0.84675682, + "learning_rate": 4.604557934629372e-08, + "loss": 0.86810172, + "num_input_tokens_seen": 167843060, + "step": 7764, + "time_per_iteration": 2.5723752975463867 + }, + { + "auxiliary_loss_clip": 0.01133255, + "auxiliary_loss_mlp": 0.01021582, + "balance_loss_clip": 1.04558575, + "balance_loss_mlp": 1.01515388, + "epoch": 0.9336860458125413, + "flos": 20266833859200.0, + "grad_norm": 1.8042832881352566, + "language_loss": 0.80361527, + "learning_rate": 4.587953887071805e-08, + "loss": 0.82516366, + "num_input_tokens_seen": 167862880, + "step": 7765, + "time_per_iteration": 2.459326982498169 + }, + { + "auxiliary_loss_clip": 0.01133867, + "auxiliary_loss_mlp": 0.01026673, + "balance_loss_clip": 1.04234624, + "balance_loss_mlp": 1.019485, + "epoch": 0.9338062887031804, + "flos": 20919689504640.0, + "grad_norm": 15.34083718671415, + "language_loss": 0.85835814, + "learning_rate": 4.5713794833662554e-08, + "loss": 0.87996364, + "num_input_tokens_seen": 167882095, + "step": 7766, + "time_per_iteration": 2.469886064529419 + }, + { + "auxiliary_loss_clip": 0.01165673, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.04746377, + "balance_loss_mlp": 1.01961958, + "epoch": 0.9339265315938196, + "flos": 23221635482880.0, + "grad_norm": 1.8208497749642827, + "language_loss": 0.63362467, + "learning_rate": 4.5548347260270236e-08, + "loss": 0.65555209, + "num_input_tokens_seen": 167901385, + "step": 7767, + "time_per_iteration": 2.42073392868042 + }, + { + "auxiliary_loss_clip": 0.01119422, + "auxiliary_loss_mlp": 0.01021985, + "balance_loss_clip": 1.04212189, + "balance_loss_mlp": 1.01518703, + "epoch": 0.9340467744844586, + "flos": 22820261932800.0, + "grad_norm": 1.7006399955007752, + "language_loss": 0.69158447, + "learning_rate": 4.538319617564012e-08, + "loss": 0.71299851, + "num_input_tokens_seen": 167920405, + "step": 7768, + "time_per_iteration": 2.503188133239746 + }, + { + "auxiliary_loss_clip": 0.01132618, + "auxiliary_loss_mlp": 0.01022276, + "balance_loss_clip": 1.04027438, + "balance_loss_mlp": 1.01530838, + "epoch": 0.9341670173750977, + "flos": 23660428026240.0, + "grad_norm": 1.8903753494492697, + "language_loss": 0.7408365, + "learning_rate": 4.521834160482485e-08, + "loss": 0.76238549, + "num_input_tokens_seen": 167939145, + "step": 7769, + "time_per_iteration": 2.5169825553894043 + }, + { + "auxiliary_loss_clip": 0.01152735, + "auxiliary_loss_mlp": 0.01026104, + "balance_loss_clip": 1.0453434, + "balance_loss_mlp": 1.0189997, + "epoch": 0.9342872602657368, + "flos": 24824256595200.0, + "grad_norm": 1.5718700654669397, + "language_loss": 0.81908619, + "learning_rate": 4.5053783572832846e-08, + "loss": 0.84087461, + "num_input_tokens_seen": 167959325, + "step": 7770, + "time_per_iteration": 2.4633185863494873 + }, + { + "auxiliary_loss_clip": 0.01152006, + "auxiliary_loss_mlp": 0.01024606, + "balance_loss_clip": 1.04681134, + "balance_loss_mlp": 1.01768017, + "epoch": 0.9344075031563759, + "flos": 25771831332480.0, + "grad_norm": 1.666095821348059, + "language_loss": 0.76537269, + "learning_rate": 4.488952210462771e-08, + "loss": 0.78713882, + "num_input_tokens_seen": 167979530, + "step": 7771, + "time_per_iteration": 2.487454414367676 + }, + { + "auxiliary_loss_clip": 0.01162954, + "auxiliary_loss_mlp": 0.01021831, + "balance_loss_clip": 1.04672909, + "balance_loss_mlp": 1.01490474, + "epoch": 0.9345277460470149, + "flos": 25551303782400.0, + "grad_norm": 2.3990158235627224, + "language_loss": 0.8547321, + "learning_rate": 4.4725557225127495e-08, + "loss": 0.87658, + "num_input_tokens_seen": 167997870, + "step": 7772, + "time_per_iteration": 2.4244980812072754 + }, + { + "auxiliary_loss_clip": 0.01152041, + "auxiliary_loss_mlp": 0.01026955, + "balance_loss_clip": 1.0475527, + "balance_loss_mlp": 1.02057409, + "epoch": 0.9346479889376541, + "flos": 34313112432000.0, + "grad_norm": 1.8285645013147327, + "language_loss": 0.79216981, + "learning_rate": 4.456188895920565e-08, + "loss": 0.81395984, + "num_input_tokens_seen": 168019625, + "step": 7773, + "time_per_iteration": 2.5352118015289307 + }, + { + "auxiliary_loss_clip": 0.011659, + "auxiliary_loss_mlp": 0.01024555, + "balance_loss_clip": 1.0481379, + "balance_loss_mlp": 1.01721811, + "epoch": 0.9347682318282932, + "flos": 19093739581440.0, + "grad_norm": 1.969581702050408, + "language_loss": 0.85555845, + "learning_rate": 4.439851733169031e-08, + "loss": 0.87746298, + "num_input_tokens_seen": 168037415, + "step": 7774, + "time_per_iteration": 3.2043185234069824 + }, + { + "auxiliary_loss_clip": 0.01123122, + "auxiliary_loss_mlp": 0.01029402, + "balance_loss_clip": 1.04230046, + "balance_loss_mlp": 1.02273536, + "epoch": 0.9348884747189322, + "flos": 26249587153920.0, + "grad_norm": 2.5685939524587074, + "language_loss": 0.69176096, + "learning_rate": 4.4235442367365204e-08, + "loss": 0.71328622, + "num_input_tokens_seen": 168057725, + "step": 7775, + "time_per_iteration": 3.276679039001465 + }, + { + "auxiliary_loss_clip": 0.01131988, + "auxiliary_loss_mlp": 0.01023976, + "balance_loss_clip": 1.04113317, + "balance_loss_mlp": 1.01660895, + "epoch": 0.9350087176095714, + "flos": 18333080242560.0, + "grad_norm": 2.3176809293446397, + "language_loss": 0.79683447, + "learning_rate": 4.4072664090968545e-08, + "loss": 0.81839418, + "num_input_tokens_seen": 168076110, + "step": 7776, + "time_per_iteration": 3.250821590423584 + }, + { + "auxiliary_loss_clip": 0.01136684, + "auxiliary_loss_mlp": 0.01025387, + "balance_loss_clip": 1.04198039, + "balance_loss_mlp": 1.01855874, + "epoch": 0.9351289605002104, + "flos": 19318253541120.0, + "grad_norm": 1.8559219383452683, + "language_loss": 0.84946561, + "learning_rate": 4.391018252719347e-08, + "loss": 0.8710863, + "num_input_tokens_seen": 168095905, + "step": 7777, + "time_per_iteration": 2.4717464447021484 + }, + { + "auxiliary_loss_clip": 0.01137661, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.0426228, + "balance_loss_mlp": 1.02189183, + "epoch": 0.9352492033908495, + "flos": 18799990156800.0, + "grad_norm": 1.9409895288930221, + "language_loss": 0.6928851, + "learning_rate": 4.374799770068849e-08, + "loss": 0.71455467, + "num_input_tokens_seen": 168112580, + "step": 7778, + "time_per_iteration": 2.4392635822296143 + }, + { + "auxiliary_loss_clip": 0.01148175, + "auxiliary_loss_mlp": 0.01023359, + "balance_loss_clip": 1.04526949, + "balance_loss_mlp": 1.01598263, + "epoch": 0.9353694462814887, + "flos": 29530134241920.0, + "grad_norm": 1.9265869928941481, + "language_loss": 0.74665433, + "learning_rate": 4.358610963605658e-08, + "loss": 0.76836967, + "num_input_tokens_seen": 168133030, + "step": 7779, + "time_per_iteration": 2.4740631580352783 + }, + { + "auxiliary_loss_clip": 0.01168087, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.04889739, + "balance_loss_mlp": 1.0265367, + "epoch": 0.9354896891721277, + "flos": 30665450390400.0, + "grad_norm": 2.273553290300527, + "language_loss": 0.68731302, + "learning_rate": 4.342451835785677e-08, + "loss": 0.70932937, + "num_input_tokens_seen": 168153940, + "step": 7780, + "time_per_iteration": 2.4637398719787598 + }, + { + "auxiliary_loss_clip": 0.01134468, + "auxiliary_loss_mlp": 0.01022129, + "balance_loss_clip": 1.04410577, + "balance_loss_mlp": 1.01531363, + "epoch": 0.9356099320627668, + "flos": 19463907191040.0, + "grad_norm": 1.5673099519592728, + "language_loss": 0.74924242, + "learning_rate": 4.3263223890601665e-08, + "loss": 0.7708084, + "num_input_tokens_seen": 168172650, + "step": 7781, + "time_per_iteration": 2.440971612930298 + }, + { + "auxiliary_loss_clip": 0.0114648, + "auxiliary_loss_mlp": 0.00761531, + "balance_loss_clip": 1.04738081, + "balance_loss_mlp": 1.00047398, + "epoch": 0.9357301749534058, + "flos": 19098156954240.0, + "grad_norm": 1.935911680435023, + "language_loss": 0.79572177, + "learning_rate": 4.31022262587597e-08, + "loss": 0.81480187, + "num_input_tokens_seen": 168191325, + "step": 7782, + "time_per_iteration": 2.4253463745117188 + }, + { + "auxiliary_loss_clip": 0.01150224, + "auxiliary_loss_mlp": 0.01028048, + "balance_loss_clip": 1.04578006, + "balance_loss_mlp": 1.02001333, + "epoch": 0.935850417844045, + "flos": 23550361776000.0, + "grad_norm": 1.580729245362894, + "language_loss": 0.6575495, + "learning_rate": 4.2941525486754225e-08, + "loss": 0.67933214, + "num_input_tokens_seen": 168211645, + "step": 7783, + "time_per_iteration": 3.1668505668640137 + }, + { + "auxiliary_loss_clip": 0.01116851, + "auxiliary_loss_mlp": 0.01022312, + "balance_loss_clip": 1.0418303, + "balance_loss_mlp": 1.01605356, + "epoch": 0.935970660734684, + "flos": 18588333265920.0, + "grad_norm": 1.7964190534449191, + "language_loss": 0.7951709, + "learning_rate": 4.278112159896286e-08, + "loss": 0.81656253, + "num_input_tokens_seen": 168229485, + "step": 7784, + "time_per_iteration": 2.4852676391601562 + }, + { + "auxiliary_loss_clip": 0.01128844, + "auxiliary_loss_mlp": 0.01021092, + "balance_loss_clip": 1.0398128, + "balance_loss_mlp": 1.0146699, + "epoch": 0.9360909036253231, + "flos": 20631255292800.0, + "grad_norm": 1.6579822404683744, + "language_loss": 0.67743784, + "learning_rate": 4.2621014619719896e-08, + "loss": 0.69893718, + "num_input_tokens_seen": 168247250, + "step": 7785, + "time_per_iteration": 2.4690158367156982 + }, + { + "auxiliary_loss_clip": 0.01038029, + "auxiliary_loss_mlp": 0.01001393, + "balance_loss_clip": 1.0067811, + "balance_loss_mlp": 1.00048733, + "epoch": 0.9362111465159623, + "flos": 61791421052160.0, + "grad_norm": 0.7231168886449887, + "language_loss": 0.58630222, + "learning_rate": 4.246120457331215e-08, + "loss": 0.60669643, + "num_input_tokens_seen": 168309425, + "step": 7786, + "time_per_iteration": 3.085767984390259 + }, + { + "auxiliary_loss_clip": 0.01132813, + "auxiliary_loss_mlp": 0.01024115, + "balance_loss_clip": 1.04543233, + "balance_loss_mlp": 1.01677513, + "epoch": 0.9363313894066013, + "flos": 24170395368960.0, + "grad_norm": 2.7735232215143557, + "language_loss": 0.72310555, + "learning_rate": 4.2301691483983325e-08, + "loss": 0.74467486, + "num_input_tokens_seen": 168329545, + "step": 7787, + "time_per_iteration": 2.5015017986297607 + }, + { + "auxiliary_loss_clip": 0.01152399, + "auxiliary_loss_mlp": 0.01022984, + "balance_loss_clip": 1.04464877, + "balance_loss_mlp": 1.01555109, + "epoch": 0.9364516322972404, + "flos": 20120354196480.0, + "grad_norm": 2.5073143857702576, + "language_loss": 0.75994349, + "learning_rate": 4.214247537593163e-08, + "loss": 0.78169727, + "num_input_tokens_seen": 168348795, + "step": 7788, + "time_per_iteration": 2.435617208480835 + }, + { + "auxiliary_loss_clip": 0.01136672, + "auxiliary_loss_mlp": 0.01033454, + "balance_loss_clip": 1.04271424, + "balance_loss_mlp": 1.02665269, + "epoch": 0.9365718751878795, + "flos": 20703758895360.0, + "grad_norm": 1.8076671171808698, + "language_loss": 0.80504346, + "learning_rate": 4.1983556273309293e-08, + "loss": 0.82674468, + "num_input_tokens_seen": 168367545, + "step": 7789, + "time_per_iteration": 2.4675133228302 + }, + { + "auxiliary_loss_clip": 0.01167386, + "auxiliary_loss_mlp": 0.01028656, + "balance_loss_clip": 1.04741704, + "balance_loss_mlp": 1.02112162, + "epoch": 0.9366921180785186, + "flos": 18655270260480.0, + "grad_norm": 3.11434642827236, + "language_loss": 0.6876986, + "learning_rate": 4.182493420022526e-08, + "loss": 0.70965904, + "num_input_tokens_seen": 168383215, + "step": 7790, + "time_per_iteration": 2.3460826873779297 + }, + { + "auxiliary_loss_clip": 0.01126054, + "auxiliary_loss_mlp": 0.01024757, + "balance_loss_clip": 1.04309714, + "balance_loss_mlp": 1.01791477, + "epoch": 0.9368123609691577, + "flos": 25774955815680.0, + "grad_norm": 1.7006037433620487, + "language_loss": 0.78611624, + "learning_rate": 4.166660918074139e-08, + "loss": 0.8076244, + "num_input_tokens_seen": 168403120, + "step": 7791, + "time_per_iteration": 2.5708913803100586 + }, + { + "auxiliary_loss_clip": 0.01118965, + "auxiliary_loss_mlp": 0.01024519, + "balance_loss_clip": 1.04160142, + "balance_loss_mlp": 1.01745915, + "epoch": 0.9369326038597968, + "flos": 25553386771200.0, + "grad_norm": 1.4620012960436688, + "language_loss": 0.73728013, + "learning_rate": 4.15085812388758e-08, + "loss": 0.75871497, + "num_input_tokens_seen": 168425340, + "step": 7792, + "time_per_iteration": 2.5659966468811035 + }, + { + "auxiliary_loss_clip": 0.0113498, + "auxiliary_loss_mlp": 0.01025806, + "balance_loss_clip": 1.04341435, + "balance_loss_mlp": 1.01883793, + "epoch": 0.9370528467504359, + "flos": 23220019370880.0, + "grad_norm": 1.67810545195276, + "language_loss": 0.78653908, + "learning_rate": 4.135085039860153e-08, + "loss": 0.80814695, + "num_input_tokens_seen": 168444740, + "step": 7793, + "time_per_iteration": 2.4897918701171875 + }, + { + "auxiliary_loss_clip": 0.01138718, + "auxiliary_loss_mlp": 0.0102116, + "balance_loss_clip": 1.04795122, + "balance_loss_mlp": 1.01391828, + "epoch": 0.9371730896410749, + "flos": 24967468120320.0, + "grad_norm": 2.3897843632486113, + "language_loss": 0.78454274, + "learning_rate": 4.1193416683845906e-08, + "loss": 0.8061415, + "num_input_tokens_seen": 168463670, + "step": 7794, + "time_per_iteration": 2.4905483722686768 + }, + { + "auxiliary_loss_clip": 0.01126918, + "auxiliary_loss_mlp": 0.01025202, + "balance_loss_clip": 1.04585826, + "balance_loss_mlp": 1.01877379, + "epoch": 0.9372933325317141, + "flos": 15553091134080.0, + "grad_norm": 2.325795915003717, + "language_loss": 0.83366895, + "learning_rate": 4.103628011849136e-08, + "loss": 0.85519016, + "num_input_tokens_seen": 168479030, + "step": 7795, + "time_per_iteration": 2.4702987670898438 + }, + { + "auxiliary_loss_clip": 0.01139317, + "auxiliary_loss_mlp": 0.01023624, + "balance_loss_clip": 1.04500806, + "balance_loss_mlp": 1.01660562, + "epoch": 0.9374135754223532, + "flos": 21871861182720.0, + "grad_norm": 1.9432192115945677, + "language_loss": 0.76117641, + "learning_rate": 4.0879440726375506e-08, + "loss": 0.7828058, + "num_input_tokens_seen": 168496815, + "step": 7796, + "time_per_iteration": 2.4747061729431152 + }, + { + "auxiliary_loss_clip": 0.01133299, + "auxiliary_loss_mlp": 0.01020199, + "balance_loss_clip": 1.04025006, + "balance_loss_mlp": 1.01305223, + "epoch": 0.9375338183129922, + "flos": 22631048064000.0, + "grad_norm": 2.74369260518212, + "language_loss": 0.56226516, + "learning_rate": 4.0722898531291074e-08, + "loss": 0.58380014, + "num_input_tokens_seen": 168514055, + "step": 7797, + "time_per_iteration": 2.475250244140625 + }, + { + "auxiliary_loss_clip": 0.01142699, + "auxiliary_loss_mlp": 0.01023708, + "balance_loss_clip": 1.0441041, + "balance_loss_mlp": 1.0164547, + "epoch": 0.9376540612036314, + "flos": 26104292640000.0, + "grad_norm": 1.942390645137417, + "language_loss": 0.766204, + "learning_rate": 4.0566653556985295e-08, + "loss": 0.78786808, + "num_input_tokens_seen": 168534600, + "step": 7798, + "time_per_iteration": 2.516303777694702 + }, + { + "auxiliary_loss_clip": 0.01083056, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.03850102, + "balance_loss_mlp": 1.02238905, + "epoch": 0.9377743040942704, + "flos": 19717580016000.0, + "grad_norm": 2.6680341454378937, + "language_loss": 0.81443667, + "learning_rate": 4.0410705827159886e-08, + "loss": 0.83557159, + "num_input_tokens_seen": 168551895, + "step": 7799, + "time_per_iteration": 2.5737733840942383 + }, + { + "auxiliary_loss_clip": 0.01133304, + "auxiliary_loss_mlp": 0.01024367, + "balance_loss_clip": 1.04146504, + "balance_loss_mlp": 1.01711297, + "epoch": 0.9378945469849095, + "flos": 15267530010240.0, + "grad_norm": 1.9570826450293635, + "language_loss": 0.7145617, + "learning_rate": 4.0255055365472356e-08, + "loss": 0.7361384, + "num_input_tokens_seen": 168569990, + "step": 7800, + "time_per_iteration": 2.4544239044189453 + }, + { + "auxiliary_loss_clip": 0.01095665, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.03767526, + "balance_loss_mlp": 1.02018118, + "epoch": 0.9380147898755486, + "flos": 20591394174720.0, + "grad_norm": 2.9220867427633412, + "language_loss": 0.74910402, + "learning_rate": 4.009970219553471e-08, + "loss": 0.77033257, + "num_input_tokens_seen": 168586940, + "step": 7801, + "time_per_iteration": 3.3849499225616455 + }, + { + "auxiliary_loss_clip": 0.01154351, + "auxiliary_loss_mlp": 0.01024841, + "balance_loss_clip": 1.04489481, + "balance_loss_mlp": 1.01719368, + "epoch": 0.9381350327661877, + "flos": 26281116316800.0, + "grad_norm": 2.7744341475645493, + "language_loss": 0.77005196, + "learning_rate": 3.99446463409141e-08, + "loss": 0.79184389, + "num_input_tokens_seen": 168604795, + "step": 7802, + "time_per_iteration": 3.2243008613586426 + }, + { + "auxiliary_loss_clip": 0.01154225, + "auxiliary_loss_mlp": 0.01024454, + "balance_loss_clip": 1.04296434, + "balance_loss_mlp": 1.01724207, + "epoch": 0.9382552756568268, + "flos": 23586344225280.0, + "grad_norm": 2.002154923716347, + "language_loss": 0.687397, + "learning_rate": 3.978988782513215e-08, + "loss": 0.70918369, + "num_input_tokens_seen": 168622290, + "step": 7803, + "time_per_iteration": 3.318350315093994 + }, + { + "auxiliary_loss_clip": 0.01153139, + "auxiliary_loss_mlp": 0.01020312, + "balance_loss_clip": 1.04404259, + "balance_loss_mlp": 1.01345742, + "epoch": 0.9383755185474659, + "flos": 28438809275520.0, + "grad_norm": 1.7941594912290635, + "language_loss": 0.76412523, + "learning_rate": 3.963542667166586e-08, + "loss": 0.7858597, + "num_input_tokens_seen": 168642395, + "step": 7804, + "time_per_iteration": 2.520297050476074 + }, + { + "auxiliary_loss_clip": 0.01128937, + "auxiliary_loss_mlp": 0.01025573, + "balance_loss_clip": 1.04824924, + "balance_loss_mlp": 1.0187602, + "epoch": 0.938495761438105, + "flos": 20449583280000.0, + "grad_norm": 1.6813638873015362, + "language_loss": 0.68237638, + "learning_rate": 3.9481262903946486e-08, + "loss": 0.70392156, + "num_input_tokens_seen": 168661840, + "step": 7805, + "time_per_iteration": 2.534358501434326 + }, + { + "auxiliary_loss_clip": 0.01024006, + "auxiliary_loss_mlp": 0.01001157, + "balance_loss_clip": 1.00830948, + "balance_loss_mlp": 1.00019741, + "epoch": 0.938616004328744, + "flos": 69302711658240.0, + "grad_norm": 0.7719606455094693, + "language_loss": 0.54471767, + "learning_rate": 3.932739654536066e-08, + "loss": 0.5649693, + "num_input_tokens_seen": 168724540, + "step": 7806, + "time_per_iteration": 3.108773946762085 + }, + { + "auxiliary_loss_clip": 0.01150496, + "auxiliary_loss_mlp": 0.0102358, + "balance_loss_clip": 1.04647648, + "balance_loss_mlp": 1.01726782, + "epoch": 0.9387362472193832, + "flos": 18911636605440.0, + "grad_norm": 2.13495686068401, + "language_loss": 0.74022591, + "learning_rate": 3.917382761925014e-08, + "loss": 0.76196665, + "num_input_tokens_seen": 168740375, + "step": 7807, + "time_per_iteration": 2.4531922340393066 + }, + { + "auxiliary_loss_clip": 0.01147488, + "auxiliary_loss_mlp": 0.01026001, + "balance_loss_clip": 1.04545498, + "balance_loss_mlp": 1.01929867, + "epoch": 0.9388564901100223, + "flos": 26501967089280.0, + "grad_norm": 1.7085085274545841, + "language_loss": 0.79436463, + "learning_rate": 3.9020556148910754e-08, + "loss": 0.81609952, + "num_input_tokens_seen": 168759730, + "step": 7808, + "time_per_iteration": 2.5222485065460205 + }, + { + "auxiliary_loss_clip": 0.01044044, + "auxiliary_loss_mlp": 0.01000411, + "balance_loss_clip": 1.00848341, + "balance_loss_mlp": 0.99955845, + "epoch": 0.9389767330006613, + "flos": 58941083157120.0, + "grad_norm": 0.7353404238872535, + "language_loss": 0.5674243, + "learning_rate": 3.8867582157593895e-08, + "loss": 0.58786893, + "num_input_tokens_seen": 168813935, + "step": 7809, + "time_per_iteration": 2.924941301345825 + }, + { + "auxiliary_loss_clip": 0.01151369, + "auxiliary_loss_mlp": 0.01021866, + "balance_loss_clip": 1.04829741, + "balance_loss_mlp": 1.0151453, + "epoch": 0.9390969758913005, + "flos": 31102554994560.0, + "grad_norm": 2.5688877658802576, + "language_loss": 0.76330429, + "learning_rate": 3.871490566850544e-08, + "loss": 0.78503668, + "num_input_tokens_seen": 168838145, + "step": 7810, + "time_per_iteration": 3.3108325004577637 + }, + { + "auxiliary_loss_clip": 0.01133487, + "auxiliary_loss_mlp": 0.0102335, + "balance_loss_clip": 1.04414129, + "balance_loss_mlp": 1.01631927, + "epoch": 0.9392172187819395, + "flos": 22419391173120.0, + "grad_norm": 1.8026362078527254, + "language_loss": 0.70803982, + "learning_rate": 3.856252670480642e-08, + "loss": 0.72960818, + "num_input_tokens_seen": 168856805, + "step": 7811, + "time_per_iteration": 2.4905502796173096 + }, + { + "auxiliary_loss_clip": 0.01133843, + "auxiliary_loss_mlp": 0.01026459, + "balance_loss_clip": 1.0414331, + "balance_loss_mlp": 1.01903772, + "epoch": 0.9393374616725786, + "flos": 19719483436800.0, + "grad_norm": 1.8284271896453064, + "language_loss": 0.81367469, + "learning_rate": 3.841044528961279e-08, + "loss": 0.83527768, + "num_input_tokens_seen": 168874600, + "step": 7812, + "time_per_iteration": 2.485079050064087 + }, + { + "auxiliary_loss_clip": 0.01164374, + "auxiliary_loss_mlp": 0.01022322, + "balance_loss_clip": 1.04488468, + "balance_loss_mlp": 1.01506829, + "epoch": 0.9394577045632178, + "flos": 24170215800960.0, + "grad_norm": 1.9090386087029503, + "language_loss": 0.78516585, + "learning_rate": 3.825866144599477e-08, + "loss": 0.80703282, + "num_input_tokens_seen": 168893655, + "step": 7813, + "time_per_iteration": 2.4292402267456055 + }, + { + "auxiliary_loss_clip": 0.01136229, + "auxiliary_loss_mlp": 0.01021705, + "balance_loss_clip": 1.04269695, + "balance_loss_mlp": 1.01446617, + "epoch": 0.9395779474538568, + "flos": 19023929498880.0, + "grad_norm": 1.9382582998029014, + "language_loss": 0.75169373, + "learning_rate": 3.8107175196978145e-08, + "loss": 0.77327305, + "num_input_tokens_seen": 168909960, + "step": 7814, + "time_per_iteration": 2.447415590286255 + }, + { + "auxiliary_loss_clip": 0.01121879, + "auxiliary_loss_mlp": 0.01026275, + "balance_loss_clip": 1.04426503, + "balance_loss_mlp": 1.01953089, + "epoch": 0.9396981903444959, + "flos": 14319129260160.0, + "grad_norm": 4.050239831428167, + "language_loss": 0.76650649, + "learning_rate": 3.7955986565542996e-08, + "loss": 0.78798807, + "num_input_tokens_seen": 168928040, + "step": 7815, + "time_per_iteration": 2.4909706115722656 + }, + { + "auxiliary_loss_clip": 0.01122221, + "auxiliary_loss_mlp": 0.01029132, + "balance_loss_clip": 1.04187632, + "balance_loss_mlp": 1.02242899, + "epoch": 0.9398184332351349, + "flos": 34787564202240.0, + "grad_norm": 1.8837717056348675, + "language_loss": 0.68207049, + "learning_rate": 3.780509557462497e-08, + "loss": 0.70358396, + "num_input_tokens_seen": 168948240, + "step": 7816, + "time_per_iteration": 2.6253855228424072 + }, + { + "auxiliary_loss_clip": 0.01132494, + "auxiliary_loss_mlp": 0.01022974, + "balance_loss_clip": 1.04111564, + "balance_loss_mlp": 1.01553535, + "epoch": 0.9399386761257741, + "flos": 25372253462400.0, + "grad_norm": 1.5061363765502904, + "language_loss": 0.75301248, + "learning_rate": 3.765450224711375e-08, + "loss": 0.77456713, + "num_input_tokens_seen": 168968745, + "step": 7817, + "time_per_iteration": 2.5172901153564453 + }, + { + "auxiliary_loss_clip": 0.0113344, + "auxiliary_loss_mlp": 0.01023986, + "balance_loss_clip": 1.04583824, + "balance_loss_mlp": 1.01706004, + "epoch": 0.9400589190164131, + "flos": 27304965584640.0, + "grad_norm": 1.7555954865867462, + "language_loss": 0.79811132, + "learning_rate": 3.750420660585396e-08, + "loss": 0.81968558, + "num_input_tokens_seen": 168990685, + "step": 7818, + "time_per_iteration": 2.539062261581421 + }, + { + "auxiliary_loss_clip": 0.01164381, + "auxiliary_loss_mlp": 0.0102412, + "balance_loss_clip": 1.04792666, + "balance_loss_mlp": 1.01726007, + "epoch": 0.9401791619070522, + "flos": 23399859790080.0, + "grad_norm": 2.0165620934241266, + "language_loss": 0.7980516, + "learning_rate": 3.735420867364603e-08, + "loss": 0.81993663, + "num_input_tokens_seen": 169011665, + "step": 7819, + "time_per_iteration": 2.435166835784912 + }, + { + "auxiliary_loss_clip": 0.01085759, + "auxiliary_loss_mlp": 0.01020414, + "balance_loss_clip": 1.03537977, + "balance_loss_mlp": 1.01383698, + "epoch": 0.9402994047976914, + "flos": 35881403120640.0, + "grad_norm": 1.6147118527315714, + "language_loss": 0.6174134, + "learning_rate": 3.7204508473244186e-08, + "loss": 0.63847518, + "num_input_tokens_seen": 169035290, + "step": 7820, + "time_per_iteration": 2.6953890323638916 + }, + { + "auxiliary_loss_clip": 0.01075353, + "auxiliary_loss_mlp": 0.01022725, + "balance_loss_clip": 1.0386436, + "balance_loss_mlp": 1.01636803, + "epoch": 0.9404196476883304, + "flos": 22236821320320.0, + "grad_norm": 3.2616484913639825, + "language_loss": 0.69448584, + "learning_rate": 3.7055106027357395e-08, + "loss": 0.71546662, + "num_input_tokens_seen": 169055155, + "step": 7821, + "time_per_iteration": 2.607365608215332 + }, + { + "auxiliary_loss_clip": 0.011494, + "auxiliary_loss_mlp": 0.01022732, + "balance_loss_clip": 1.04647636, + "balance_loss_mlp": 1.015275, + "epoch": 0.9405398905789695, + "flos": 18915802583040.0, + "grad_norm": 1.9504624223105993, + "language_loss": 0.71733874, + "learning_rate": 3.690600135865063e-08, + "loss": 0.73906004, + "num_input_tokens_seen": 169072080, + "step": 7822, + "time_per_iteration": 2.414651870727539 + }, + { + "auxiliary_loss_clip": 0.01020244, + "auxiliary_loss_mlp": 0.01001222, + "balance_loss_clip": 1.00700426, + "balance_loss_mlp": 1.00032222, + "epoch": 0.9406601334696086, + "flos": 70274130048000.0, + "grad_norm": 0.7955488122472639, + "language_loss": 0.58150923, + "learning_rate": 3.675719448974246e-08, + "loss": 0.60172391, + "num_input_tokens_seen": 169137170, + "step": 7823, + "time_per_iteration": 3.236582040786743 + }, + { + "auxiliary_loss_clip": 0.01106744, + "auxiliary_loss_mlp": 0.00761564, + "balance_loss_clip": 1.04196453, + "balance_loss_mlp": 1.00043607, + "epoch": 0.9407803763602477, + "flos": 22165071903360.0, + "grad_norm": 1.9763820737392357, + "language_loss": 0.60037947, + "learning_rate": 3.6608685443207054e-08, + "loss": 0.6190626, + "num_input_tokens_seen": 169156320, + "step": 7824, + "time_per_iteration": 2.839423418045044 + }, + { + "auxiliary_loss_clip": 0.01125414, + "auxiliary_loss_mlp": 0.0102409, + "balance_loss_clip": 1.04265451, + "balance_loss_mlp": 1.01742303, + "epoch": 0.9409006192508867, + "flos": 18879496911360.0, + "grad_norm": 2.4829460325040564, + "language_loss": 0.66655159, + "learning_rate": 3.646047424157306e-08, + "loss": 0.68804657, + "num_input_tokens_seen": 169173295, + "step": 7825, + "time_per_iteration": 2.4948480129241943 + }, + { + "auxiliary_loss_clip": 0.01137966, + "auxiliary_loss_mlp": 0.01026241, + "balance_loss_clip": 1.04580545, + "balance_loss_mlp": 1.01870084, + "epoch": 0.9410208621415259, + "flos": 23368258800000.0, + "grad_norm": 2.5389447741234665, + "language_loss": 0.68397832, + "learning_rate": 3.631256090732382e-08, + "loss": 0.70562041, + "num_input_tokens_seen": 169193755, + "step": 7826, + "time_per_iteration": 2.4857218265533447 + }, + { + "auxiliary_loss_clip": 0.01124, + "auxiliary_loss_mlp": 0.01024014, + "balance_loss_clip": 1.04449153, + "balance_loss_mlp": 1.01751721, + "epoch": 0.941141105032165, + "flos": 22742227635840.0, + "grad_norm": 2.0194203161119013, + "language_loss": 0.82629824, + "learning_rate": 3.6164945462897833e-08, + "loss": 0.84777832, + "num_input_tokens_seen": 169213045, + "step": 7827, + "time_per_iteration": 2.547708034515381 + }, + { + "auxiliary_loss_clip": 0.01149818, + "auxiliary_loss_mlp": 0.00761317, + "balance_loss_clip": 1.04693913, + "balance_loss_mlp": 1.00037503, + "epoch": 0.941261347922804, + "flos": 20704908130560.0, + "grad_norm": 1.65488498100959, + "language_loss": 0.75595284, + "learning_rate": 3.6017627930687856e-08, + "loss": 0.77506417, + "num_input_tokens_seen": 169232870, + "step": 7828, + "time_per_iteration": 3.2857019901275635 + }, + { + "auxiliary_loss_clip": 0.01105444, + "auxiliary_loss_mlp": 0.01021398, + "balance_loss_clip": 1.03908086, + "balance_loss_mlp": 1.01471066, + "epoch": 0.9413815908134432, + "flos": 19421998997760.0, + "grad_norm": 2.0088585946240354, + "language_loss": 0.77042317, + "learning_rate": 3.587060833304267e-08, + "loss": 0.79169154, + "num_input_tokens_seen": 169251060, + "step": 7829, + "time_per_iteration": 4.3125159740448 + }, + { + "auxiliary_loss_clip": 0.01153841, + "auxiliary_loss_mlp": 0.0102507, + "balance_loss_clip": 1.04700339, + "balance_loss_mlp": 1.01793277, + "epoch": 0.9415018337040822, + "flos": 17493452853120.0, + "grad_norm": 1.9794684696201283, + "language_loss": 0.64035976, + "learning_rate": 3.5723886692264225e-08, + "loss": 0.66214889, + "num_input_tokens_seen": 169268600, + "step": 7830, + "time_per_iteration": 2.4527854919433594 + }, + { + "auxiliary_loss_clip": 0.01131958, + "auxiliary_loss_mlp": 0.01025191, + "balance_loss_clip": 1.04079461, + "balance_loss_mlp": 1.01860428, + "epoch": 0.9416220765947213, + "flos": 31831613343360.0, + "grad_norm": 2.0541751213589077, + "language_loss": 0.62432235, + "learning_rate": 3.557746303061071e-08, + "loss": 0.64589387, + "num_input_tokens_seen": 169290355, + "step": 7831, + "time_per_iteration": 2.593571901321411 + }, + { + "auxiliary_loss_clip": 0.01133843, + "auxiliary_loss_mlp": 0.01021214, + "balance_loss_clip": 1.04275656, + "balance_loss_mlp": 1.01470542, + "epoch": 0.9417423194853605, + "flos": 23511973115520.0, + "grad_norm": 1.5962669450845155, + "language_loss": 0.72410691, + "learning_rate": 3.543133737029391e-08, + "loss": 0.74565744, + "num_input_tokens_seen": 169310865, + "step": 7832, + "time_per_iteration": 2.5236294269561768 + }, + { + "auxiliary_loss_clip": 0.01154959, + "auxiliary_loss_mlp": 0.01022886, + "balance_loss_clip": 1.04571009, + "balance_loss_mlp": 1.01582038, + "epoch": 0.9418625623759995, + "flos": 23915106432000.0, + "grad_norm": 2.718070526543077, + "language_loss": 0.69199193, + "learning_rate": 3.5285509733481214e-08, + "loss": 0.71377039, + "num_input_tokens_seen": 169330590, + "step": 7833, + "time_per_iteration": 2.481441020965576 + }, + { + "auxiliary_loss_clip": 0.01147231, + "auxiliary_loss_mlp": 0.01027193, + "balance_loss_clip": 1.04372609, + "balance_loss_mlp": 1.01955175, + "epoch": 0.9419828052666386, + "flos": 18076965292800.0, + "grad_norm": 2.151817133250693, + "language_loss": 0.76551831, + "learning_rate": 3.513998014229469e-08, + "loss": 0.78726256, + "num_input_tokens_seen": 169349540, + "step": 7834, + "time_per_iteration": 2.424168825149536 + }, + { + "auxiliary_loss_clip": 0.01137357, + "auxiliary_loss_mlp": 0.01023545, + "balance_loss_clip": 1.04495227, + "balance_loss_mlp": 1.01679182, + "epoch": 0.9421030481572777, + "flos": 17712328377600.0, + "grad_norm": 2.578003550508524, + "language_loss": 0.86075974, + "learning_rate": 3.499474861881069e-08, + "loss": 0.88236868, + "num_input_tokens_seen": 169366765, + "step": 7835, + "time_per_iteration": 2.4340837001800537 + }, + { + "auxiliary_loss_clip": 0.01095575, + "auxiliary_loss_mlp": 0.01019496, + "balance_loss_clip": 1.04073191, + "balance_loss_mlp": 1.01264429, + "epoch": 0.9422232910479168, + "flos": 20194114775040.0, + "grad_norm": 1.8416254845304234, + "language_loss": 0.67910206, + "learning_rate": 3.4849815185061136e-08, + "loss": 0.70025283, + "num_input_tokens_seen": 169386655, + "step": 7836, + "time_per_iteration": 2.6137242317199707 + }, + { + "auxiliary_loss_clip": 0.01147591, + "auxiliary_loss_mlp": 0.01021292, + "balance_loss_clip": 1.04196286, + "balance_loss_mlp": 1.0149473, + "epoch": 0.9423435339385559, + "flos": 18442571875200.0, + "grad_norm": 1.9351711929411826, + "language_loss": 0.76200539, + "learning_rate": 3.470517986303223e-08, + "loss": 0.78369421, + "num_input_tokens_seen": 169405640, + "step": 7837, + "time_per_iteration": 3.2082364559173584 + }, + { + "auxiliary_loss_clip": 0.0112106, + "auxiliary_loss_mlp": 0.01033876, + "balance_loss_clip": 1.04501939, + "balance_loss_mlp": 1.02684546, + "epoch": 0.942463776829195, + "flos": 20080636732800.0, + "grad_norm": 1.8018896076547664, + "language_loss": 0.79175133, + "learning_rate": 3.4560842674664856e-08, + "loss": 0.81330073, + "num_input_tokens_seen": 169424155, + "step": 7838, + "time_per_iteration": 2.509373188018799 + }, + { + "auxiliary_loss_clip": 0.01152592, + "auxiliary_loss_mlp": 0.01020774, + "balance_loss_clip": 1.04382777, + "balance_loss_mlp": 1.01363909, + "epoch": 0.9425840197198341, + "flos": 22636255536000.0, + "grad_norm": 1.7856425887286214, + "language_loss": 0.75214523, + "learning_rate": 3.441680364185506e-08, + "loss": 0.77387887, + "num_input_tokens_seen": 169444025, + "step": 7839, + "time_per_iteration": 2.45835280418396 + }, + { + "auxiliary_loss_clip": 0.01141236, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.0474081, + "balance_loss_mlp": 1.0212183, + "epoch": 0.9427042626104731, + "flos": 19937892084480.0, + "grad_norm": 2.1279493500966087, + "language_loss": 0.75157106, + "learning_rate": 3.427306278645314e-08, + "loss": 0.77326781, + "num_input_tokens_seen": 169462480, + "step": 7840, + "time_per_iteration": 2.4605257511138916 + }, + { + "auxiliary_loss_clip": 0.01108333, + "auxiliary_loss_mlp": 0.01021506, + "balance_loss_clip": 1.04143429, + "balance_loss_mlp": 1.01475585, + "epoch": 0.9428245055011123, + "flos": 22856998567680.0, + "grad_norm": 3.1156319889505286, + "language_loss": 0.72698045, + "learning_rate": 3.4129620130264767e-08, + "loss": 0.74827886, + "num_input_tokens_seen": 169480840, + "step": 7841, + "time_per_iteration": 2.539158821105957 + }, + { + "auxiliary_loss_clip": 0.01142726, + "auxiliary_loss_mlp": 0.00761415, + "balance_loss_clip": 1.04810536, + "balance_loss_mlp": 1.00046575, + "epoch": 0.9429447483917514, + "flos": 20951757371520.0, + "grad_norm": 2.2051420720485986, + "language_loss": 0.78080136, + "learning_rate": 3.398647569505009e-08, + "loss": 0.79984283, + "num_input_tokens_seen": 169498265, + "step": 7842, + "time_per_iteration": 2.4850659370422363 + }, + { + "auxiliary_loss_clip": 0.01127832, + "auxiliary_loss_mlp": 0.0102496, + "balance_loss_clip": 1.04265666, + "balance_loss_mlp": 1.01764941, + "epoch": 0.9430649912823904, + "flos": 18843658116480.0, + "grad_norm": 2.458175718296991, + "language_loss": 0.74562967, + "learning_rate": 3.384362950252373e-08, + "loss": 0.76715755, + "num_input_tokens_seen": 169515235, + "step": 7843, + "time_per_iteration": 2.502018928527832 + }, + { + "auxiliary_loss_clip": 0.01132468, + "auxiliary_loss_mlp": 0.01021218, + "balance_loss_clip": 1.04088688, + "balance_loss_mlp": 1.01443481, + "epoch": 0.9431852341730296, + "flos": 32556038837760.0, + "grad_norm": 2.1438351596002847, + "language_loss": 0.56783378, + "learning_rate": 3.3701081574355473e-08, + "loss": 0.58937061, + "num_input_tokens_seen": 169537195, + "step": 7844, + "time_per_iteration": 2.6157724857330322 + }, + { + "auxiliary_loss_clip": 0.01044108, + "auxiliary_loss_mlp": 0.01000561, + "balance_loss_clip": 1.00895619, + "balance_loss_mlp": 0.99973881, + "epoch": 0.9433054770636686, + "flos": 66904490252160.0, + "grad_norm": 0.6389719999319686, + "language_loss": 0.51683843, + "learning_rate": 3.3558831932169796e-08, + "loss": 0.53728509, + "num_input_tokens_seen": 169605865, + "step": 7845, + "time_per_iteration": 3.1352052688598633 + }, + { + "auxiliary_loss_clip": 0.01147098, + "auxiliary_loss_mlp": 0.01022537, + "balance_loss_clip": 1.0436132, + "balance_loss_mlp": 1.01576877, + "epoch": 0.9434257199543077, + "flos": 26140346916480.0, + "grad_norm": 1.8783722097149476, + "language_loss": 0.88667786, + "learning_rate": 3.341688059754588e-08, + "loss": 0.90837425, + "num_input_tokens_seen": 169621520, + "step": 7846, + "time_per_iteration": 2.4822561740875244 + }, + { + "auxiliary_loss_clip": 0.01128338, + "auxiliary_loss_mlp": 0.00761133, + "balance_loss_clip": 1.0414772, + "balance_loss_mlp": 1.00044942, + "epoch": 0.9435459628449467, + "flos": 25003486483200.0, + "grad_norm": 2.487402269255585, + "language_loss": 0.7776705, + "learning_rate": 3.327522759201762e-08, + "loss": 0.79656523, + "num_input_tokens_seen": 169641390, + "step": 7847, + "time_per_iteration": 2.557314872741699 + }, + { + "auxiliary_loss_clip": 0.01123359, + "auxiliary_loss_mlp": 0.01027031, + "balance_loss_clip": 1.04450023, + "balance_loss_mlp": 1.01948535, + "epoch": 0.9436662057355859, + "flos": 22163240309760.0, + "grad_norm": 2.1591998272621566, + "language_loss": 0.67063391, + "learning_rate": 3.313387293707359e-08, + "loss": 0.69213784, + "num_input_tokens_seen": 169660095, + "step": 7848, + "time_per_iteration": 2.5176422595977783 + }, + { + "auxiliary_loss_clip": 0.011214, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.04487526, + "balance_loss_mlp": 1.02023673, + "epoch": 0.943786448626225, + "flos": 20118522602880.0, + "grad_norm": 2.23324976364037, + "language_loss": 0.68399274, + "learning_rate": 3.29928166541571e-08, + "loss": 0.70548785, + "num_input_tokens_seen": 169679050, + "step": 7849, + "time_per_iteration": 2.5070457458496094 + }, + { + "auxiliary_loss_clip": 0.0112696, + "auxiliary_loss_mlp": 0.01022574, + "balance_loss_clip": 1.04282618, + "balance_loss_mlp": 1.01555312, + "epoch": 0.943906691516864, + "flos": 22090808534400.0, + "grad_norm": 2.0133440353510057, + "language_loss": 0.80231166, + "learning_rate": 3.2852058764666346e-08, + "loss": 0.823807, + "num_input_tokens_seen": 169698150, + "step": 7850, + "time_per_iteration": 2.4874753952026367 + }, + { + "auxiliary_loss_clip": 0.01111345, + "auxiliary_loss_mlp": 0.0102702, + "balance_loss_clip": 1.04403138, + "balance_loss_mlp": 1.02015352, + "epoch": 0.9440269344075032, + "flos": 35298501212160.0, + "grad_norm": 2.561723524048758, + "language_loss": 0.68113899, + "learning_rate": 3.2711599289954264e-08, + "loss": 0.70252264, + "num_input_tokens_seen": 169722185, + "step": 7851, + "time_per_iteration": 2.6302433013916016 + }, + { + "auxiliary_loss_clip": 0.01095275, + "auxiliary_loss_mlp": 0.01029302, + "balance_loss_clip": 1.03962159, + "balance_loss_mlp": 1.02248001, + "epoch": 0.9441471772981422, + "flos": 19238136255360.0, + "grad_norm": 1.8332428705005133, + "language_loss": 0.7770437, + "learning_rate": 3.257143825132847e-08, + "loss": 0.79828948, + "num_input_tokens_seen": 169740355, + "step": 7852, + "time_per_iteration": 2.5932705402374268 + }, + { + "auxiliary_loss_clip": 0.01136856, + "auxiliary_loss_mlp": 0.01022315, + "balance_loss_clip": 1.04454064, + "balance_loss_mlp": 1.01548481, + "epoch": 0.9442674201887813, + "flos": 25739799379200.0, + "grad_norm": 4.394625244606911, + "language_loss": 0.76086712, + "learning_rate": 3.243157567005106e-08, + "loss": 0.78245878, + "num_input_tokens_seen": 169758535, + "step": 7853, + "time_per_iteration": 2.51511287689209 + }, + { + "auxiliary_loss_clip": 0.01170208, + "auxiliary_loss_mlp": 0.01026276, + "balance_loss_clip": 1.05040717, + "balance_loss_mlp": 1.01894498, + "epoch": 0.9443876630794205, + "flos": 15523321737600.0, + "grad_norm": 4.034205100257239, + "language_loss": 0.63952571, + "learning_rate": 3.2292011567339296e-08, + "loss": 0.66149056, + "num_input_tokens_seen": 169776340, + "step": 7854, + "time_per_iteration": 3.224992036819458 + }, + { + "auxiliary_loss_clip": 0.01151649, + "auxiliary_loss_mlp": 0.00761541, + "balance_loss_clip": 1.04486895, + "balance_loss_mlp": 1.00043523, + "epoch": 0.9445079059700595, + "flos": 13400821128960.0, + "grad_norm": 2.1489651399515006, + "language_loss": 0.55808181, + "learning_rate": 3.21527459643649e-08, + "loss": 0.5772137, + "num_input_tokens_seen": 169793225, + "step": 7855, + "time_per_iteration": 3.1556334495544434 + }, + { + "auxiliary_loss_clip": 0.01153853, + "auxiliary_loss_mlp": 0.01023941, + "balance_loss_clip": 1.04605937, + "balance_loss_mlp": 1.01671994, + "epoch": 0.9446281488606986, + "flos": 23659242877440.0, + "grad_norm": 2.310149782181084, + "language_loss": 0.74146456, + "learning_rate": 3.2013778882254536e-08, + "loss": 0.76324248, + "num_input_tokens_seen": 169812020, + "step": 7856, + "time_per_iteration": 3.2867541313171387 + }, + { + "auxiliary_loss_clip": 0.01142727, + "auxiliary_loss_mlp": 0.01029392, + "balance_loss_clip": 1.04364061, + "balance_loss_mlp": 1.02255809, + "epoch": 0.9447483917513377, + "flos": 25557337267200.0, + "grad_norm": 1.7366303690542395, + "language_loss": 0.75807232, + "learning_rate": 3.1875110342088676e-08, + "loss": 0.7797935, + "num_input_tokens_seen": 169833470, + "step": 7857, + "time_per_iteration": 2.5026652812957764 + }, + { + "auxiliary_loss_clip": 0.01133024, + "auxiliary_loss_mlp": 0.01020985, + "balance_loss_clip": 1.04511595, + "balance_loss_mlp": 1.01433003, + "epoch": 0.9448686346419768, + "flos": 24535463247360.0, + "grad_norm": 1.6502779326658825, + "language_loss": 0.65774155, + "learning_rate": 3.1736740364904035e-08, + "loss": 0.67928159, + "num_input_tokens_seen": 169854000, + "step": 7858, + "time_per_iteration": 2.4973976612091064 + }, + { + "auxiliary_loss_clip": 0.01104734, + "auxiliary_loss_mlp": 0.0076186, + "balance_loss_clip": 1.04017067, + "balance_loss_mlp": 1.00051117, + "epoch": 0.9449888775326158, + "flos": 14721256995840.0, + "grad_norm": 2.04362269263793, + "language_loss": 0.77462316, + "learning_rate": 3.159866897169094e-08, + "loss": 0.79328907, + "num_input_tokens_seen": 169872200, + "step": 7859, + "time_per_iteration": 2.543001174926758 + }, + { + "auxiliary_loss_clip": 0.01130372, + "auxiliary_loss_mlp": 0.01025186, + "balance_loss_clip": 1.04449677, + "balance_loss_mlp": 1.0182246, + "epoch": 0.945109120423255, + "flos": 15447873219840.0, + "grad_norm": 1.7353632625093676, + "language_loss": 0.75522542, + "learning_rate": 3.146089618339487e-08, + "loss": 0.77678096, + "num_input_tokens_seen": 169889055, + "step": 7860, + "time_per_iteration": 2.4711670875549316 + }, + { + "auxiliary_loss_clip": 0.01124224, + "auxiliary_loss_mlp": 0.0102003, + "balance_loss_clip": 1.04256558, + "balance_loss_mlp": 1.01294041, + "epoch": 0.9452293633138941, + "flos": 25448097029760.0, + "grad_norm": 2.402187216531958, + "language_loss": 0.67991292, + "learning_rate": 3.132342202091554e-08, + "loss": 0.70135552, + "num_input_tokens_seen": 169909280, + "step": 7861, + "time_per_iteration": 2.54388427734375 + }, + { + "auxiliary_loss_clip": 0.01165571, + "auxiliary_loss_mlp": 0.01024379, + "balance_loss_clip": 1.04651487, + "balance_loss_mlp": 1.01716661, + "epoch": 0.9453496062045331, + "flos": 21215342350080.0, + "grad_norm": 2.0322010002819706, + "language_loss": 0.68738711, + "learning_rate": 3.1186246505107595e-08, + "loss": 0.70928663, + "num_input_tokens_seen": 169928420, + "step": 7862, + "time_per_iteration": 2.4076900482177734 + }, + { + "auxiliary_loss_clip": 0.01152482, + "auxiliary_loss_mlp": 0.01024305, + "balance_loss_clip": 1.04783988, + "balance_loss_mlp": 1.01694083, + "epoch": 0.9454698490951723, + "flos": 20010898477440.0, + "grad_norm": 1.751808450621398, + "language_loss": 0.8378222, + "learning_rate": 3.104936965678084e-08, + "loss": 0.85959011, + "num_input_tokens_seen": 169946750, + "step": 7863, + "time_per_iteration": 3.22268009185791 + }, + { + "auxiliary_loss_clip": 0.01149984, + "auxiliary_loss_mlp": 0.01021535, + "balance_loss_clip": 1.04435408, + "balance_loss_mlp": 1.01422191, + "epoch": 0.9455900919858113, + "flos": 21069652786560.0, + "grad_norm": 1.9975630592408675, + "language_loss": 0.81775296, + "learning_rate": 3.091279149669956e-08, + "loss": 0.83946812, + "num_input_tokens_seen": 169965540, + "step": 7864, + "time_per_iteration": 2.4439079761505127 + }, + { + "auxiliary_loss_clip": 0.01150629, + "auxiliary_loss_mlp": 0.00761415, + "balance_loss_clip": 1.04563642, + "balance_loss_mlp": 1.00042129, + "epoch": 0.9457103348764504, + "flos": 20740854666240.0, + "grad_norm": 1.759538006397204, + "language_loss": 0.73269653, + "learning_rate": 3.0776512045581624e-08, + "loss": 0.75181699, + "num_input_tokens_seen": 169984330, + "step": 7865, + "time_per_iteration": 2.4579951763153076 + }, + { + "auxiliary_loss_clip": 0.01131592, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.04380536, + "balance_loss_mlp": 1.02014744, + "epoch": 0.9458305777670896, + "flos": 21428363957760.0, + "grad_norm": 2.1695022760122167, + "language_loss": 0.77613556, + "learning_rate": 3.0640531324101384e-08, + "loss": 0.79772586, + "num_input_tokens_seen": 170002095, + "step": 7866, + "time_per_iteration": 2.462996244430542 + }, + { + "auxiliary_loss_clip": 0.01155181, + "auxiliary_loss_mlp": 0.01024206, + "balance_loss_clip": 1.04993081, + "balance_loss_mlp": 1.01644576, + "epoch": 0.9459508206577286, + "flos": 20011185786240.0, + "grad_norm": 1.8162068436731986, + "language_loss": 0.76079047, + "learning_rate": 3.0504849352886554e-08, + "loss": 0.78258437, + "num_input_tokens_seen": 170020240, + "step": 7867, + "time_per_iteration": 2.4411990642547607 + }, + { + "auxiliary_loss_clip": 0.01151334, + "auxiliary_loss_mlp": 0.01021391, + "balance_loss_clip": 1.04655886, + "balance_loss_mlp": 1.01462269, + "epoch": 0.9460710635483677, + "flos": 12166428291840.0, + "grad_norm": 9.210517222510402, + "language_loss": 0.712843, + "learning_rate": 3.036946615252023e-08, + "loss": 0.73457026, + "num_input_tokens_seen": 170035770, + "step": 7868, + "time_per_iteration": 2.4180634021759033 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01024854, + "balance_loss_clip": 1.04395854, + "balance_loss_mlp": 1.01774037, + "epoch": 0.9461913064390068, + "flos": 34276196229120.0, + "grad_norm": 2.3525004973140993, + "language_loss": 0.67115986, + "learning_rate": 3.0234381743539984e-08, + "loss": 0.69281471, + "num_input_tokens_seen": 170053385, + "step": 7869, + "time_per_iteration": 2.5811891555786133 + }, + { + "auxiliary_loss_clip": 0.01142579, + "auxiliary_loss_mlp": 0.01021928, + "balance_loss_clip": 1.04408407, + "balance_loss_mlp": 1.0148561, + "epoch": 0.9463115493296459, + "flos": 19463763536640.0, + "grad_norm": 2.0340607844145624, + "language_loss": 0.80144501, + "learning_rate": 3.0099596146437863e-08, + "loss": 0.82309002, + "num_input_tokens_seen": 170070490, + "step": 7870, + "time_per_iteration": 2.476649522781372 + }, + { + "auxiliary_loss_clip": 0.01060806, + "auxiliary_loss_mlp": 0.01000792, + "balance_loss_clip": 1.00727773, + "balance_loss_mlp": 0.99993342, + "epoch": 0.946431792220285, + "flos": 70570824387840.0, + "grad_norm": 0.7681569221726348, + "language_loss": 0.60079873, + "learning_rate": 2.996510938166086e-08, + "loss": 0.62141472, + "num_input_tokens_seen": 170133465, + "step": 7871, + "time_per_iteration": 3.0822348594665527 + }, + { + "auxiliary_loss_clip": 0.01148955, + "auxiliary_loss_mlp": 0.0102205, + "balance_loss_clip": 1.04755247, + "balance_loss_mlp": 1.01548183, + "epoch": 0.9465520351109241, + "flos": 18947906363520.0, + "grad_norm": 1.8692398209276266, + "language_loss": 0.7330935, + "learning_rate": 2.983092146960997e-08, + "loss": 0.75480354, + "num_input_tokens_seen": 170150810, + "step": 7872, + "time_per_iteration": 2.435206413269043 + }, + { + "auxiliary_loss_clip": 0.01137302, + "auxiliary_loss_mlp": 0.01028445, + "balance_loss_clip": 1.04230714, + "balance_loss_mlp": 1.02062798, + "epoch": 0.9466722780015632, + "flos": 19135647774720.0, + "grad_norm": 1.9942096704682952, + "language_loss": 0.80014908, + "learning_rate": 2.9697032430642256e-08, + "loss": 0.82180655, + "num_input_tokens_seen": 170169025, + "step": 7873, + "time_per_iteration": 2.4758474826812744 + }, + { + "auxiliary_loss_clip": 0.01161119, + "auxiliary_loss_mlp": 0.0102198, + "balance_loss_clip": 1.04604065, + "balance_loss_mlp": 1.01557302, + "epoch": 0.9467925208922022, + "flos": 17237912520960.0, + "grad_norm": 2.3568464725218714, + "language_loss": 0.73452997, + "learning_rate": 2.9563442285067906e-08, + "loss": 0.75636089, + "num_input_tokens_seen": 170186070, + "step": 7874, + "time_per_iteration": 2.370945692062378 + }, + { + "auxiliary_loss_clip": 0.01153819, + "auxiliary_loss_mlp": 0.01023831, + "balance_loss_clip": 1.0470686, + "balance_loss_mlp": 1.0165503, + "epoch": 0.9469127637828414, + "flos": 29169016859520.0, + "grad_norm": 2.236254873262255, + "language_loss": 0.79644465, + "learning_rate": 2.943015105315294e-08, + "loss": 0.81822109, + "num_input_tokens_seen": 170206265, + "step": 7875, + "time_per_iteration": 2.5071771144866943 + }, + { + "auxiliary_loss_clip": 0.01108762, + "auxiliary_loss_mlp": 0.01023267, + "balance_loss_clip": 1.03932202, + "balance_loss_mlp": 1.0153513, + "epoch": 0.9470330066734804, + "flos": 26030460234240.0, + "grad_norm": 3.049599271476982, + "language_loss": 0.6647172, + "learning_rate": 2.929715875511718e-08, + "loss": 0.68603754, + "num_input_tokens_seen": 170225300, + "step": 7876, + "time_per_iteration": 2.5589890480041504 + }, + { + "auxiliary_loss_clip": 0.0115107, + "auxiliary_loss_mlp": 0.01022888, + "balance_loss_clip": 1.04243565, + "balance_loss_mlp": 1.01573563, + "epoch": 0.9471532495641195, + "flos": 23440906056960.0, + "grad_norm": 1.815602305408838, + "language_loss": 0.69892883, + "learning_rate": 2.9164465411135375e-08, + "loss": 0.72066844, + "num_input_tokens_seen": 170245070, + "step": 7877, + "time_per_iteration": 2.4514646530151367 + }, + { + "auxiliary_loss_clip": 0.011531, + "auxiliary_loss_mlp": 0.01021538, + "balance_loss_clip": 1.04796863, + "balance_loss_mlp": 1.01454651, + "epoch": 0.9472734924547586, + "flos": 15815850099840.0, + "grad_norm": 1.7370293766497769, + "language_loss": 0.80699301, + "learning_rate": 2.9032071041337426e-08, + "loss": 0.82873929, + "num_input_tokens_seen": 170263305, + "step": 7878, + "time_per_iteration": 2.4093966484069824 + }, + { + "auxiliary_loss_clip": 0.01130405, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.04324424, + "balance_loss_mlp": 1.0203445, + "epoch": 0.9473937353453977, + "flos": 11181793697280.0, + "grad_norm": 1.8808750636263978, + "language_loss": 0.73073232, + "learning_rate": 2.889997566580704e-08, + "loss": 0.75230843, + "num_input_tokens_seen": 170281460, + "step": 7879, + "time_per_iteration": 2.4499850273132324 + }, + { + "auxiliary_loss_clip": 0.01165363, + "auxiliary_loss_mlp": 0.0102494, + "balance_loss_clip": 1.04604936, + "balance_loss_mlp": 1.01741755, + "epoch": 0.9475139782360368, + "flos": 25775530433280.0, + "grad_norm": 1.5926766350699328, + "language_loss": 0.7053048, + "learning_rate": 2.8768179304583086e-08, + "loss": 0.72720784, + "num_input_tokens_seen": 170303515, + "step": 7880, + "time_per_iteration": 2.4577770233154297 + }, + { + "auxiliary_loss_clip": 0.01124808, + "auxiliary_loss_mlp": 0.01026758, + "balance_loss_clip": 1.04575443, + "balance_loss_mlp": 1.01992762, + "epoch": 0.9476342211266758, + "flos": 22820046451200.0, + "grad_norm": 1.833557670925971, + "language_loss": 0.73398244, + "learning_rate": 2.8636681977659117e-08, + "loss": 0.75549817, + "num_input_tokens_seen": 170323165, + "step": 7881, + "time_per_iteration": 3.3875198364257812 + }, + { + "auxiliary_loss_clip": 0.01107102, + "auxiliary_loss_mlp": 0.01026972, + "balance_loss_clip": 1.04396105, + "balance_loss_mlp": 1.01984334, + "epoch": 0.947754464017315, + "flos": 20193611984640.0, + "grad_norm": 2.6093908905250554, + "language_loss": 0.78018767, + "learning_rate": 2.850548370498318e-08, + "loss": 0.80152845, + "num_input_tokens_seen": 170341005, + "step": 7882, + "time_per_iteration": 3.291841983795166 + }, + { + "auxiliary_loss_clip": 0.01147573, + "auxiliary_loss_mlp": 0.01020507, + "balance_loss_clip": 1.04203844, + "balance_loss_mlp": 1.01399851, + "epoch": 0.9478747069079541, + "flos": 24717925359360.0, + "grad_norm": 4.0760837896489095, + "language_loss": 0.71137774, + "learning_rate": 2.8374584506457798e-08, + "loss": 0.73305857, + "num_input_tokens_seen": 170362280, + "step": 7883, + "time_per_iteration": 3.295448064804077 + }, + { + "auxiliary_loss_clip": 0.01135331, + "auxiliary_loss_mlp": 0.01020674, + "balance_loss_clip": 1.04536188, + "balance_loss_mlp": 1.01327085, + "epoch": 0.9479949497985931, + "flos": 21361355136000.0, + "grad_norm": 2.475995640123661, + "language_loss": 0.67193711, + "learning_rate": 2.824398440193998e-08, + "loss": 0.69349718, + "num_input_tokens_seen": 170381080, + "step": 7884, + "time_per_iteration": 2.469330072402954 + }, + { + "auxiliary_loss_clip": 0.01106307, + "auxiliary_loss_mlp": 0.01025589, + "balance_loss_clip": 1.04240775, + "balance_loss_mlp": 1.01794779, + "epoch": 0.9481151926892323, + "flos": 18148606968960.0, + "grad_norm": 3.7627254362232887, + "language_loss": 0.71908408, + "learning_rate": 2.811368341124232e-08, + "loss": 0.74040306, + "num_input_tokens_seen": 170400150, + "step": 7885, + "time_per_iteration": 2.54787015914917 + }, + { + "auxiliary_loss_clip": 0.0113517, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.04399848, + "balance_loss_mlp": 1.02203727, + "epoch": 0.9482354355798713, + "flos": 22128012046080.0, + "grad_norm": 2.1640753261137786, + "language_loss": 0.68272167, + "learning_rate": 2.7983681554131222e-08, + "loss": 0.70436406, + "num_input_tokens_seen": 170420410, + "step": 7886, + "time_per_iteration": 2.4963207244873047 + }, + { + "auxiliary_loss_clip": 0.01135129, + "auxiliary_loss_mlp": 0.01024581, + "balance_loss_clip": 1.04341221, + "balance_loss_mlp": 1.01740503, + "epoch": 0.9483556784705104, + "flos": 19063072344960.0, + "grad_norm": 3.9878115921925037, + "language_loss": 0.708947, + "learning_rate": 2.7853978850327365e-08, + "loss": 0.73054409, + "num_input_tokens_seen": 170439580, + "step": 7887, + "time_per_iteration": 2.463390827178955 + }, + { + "auxiliary_loss_clip": 0.0112247, + "auxiliary_loss_mlp": 0.01022628, + "balance_loss_clip": 1.04718244, + "balance_loss_mlp": 1.01565993, + "epoch": 0.9484759213611496, + "flos": 25777110631680.0, + "grad_norm": 1.8949550063773881, + "language_loss": 0.87284803, + "learning_rate": 2.7724575319507225e-08, + "loss": 0.89429903, + "num_input_tokens_seen": 170459290, + "step": 7888, + "time_per_iteration": 2.5543770790100098 + }, + { + "auxiliary_loss_clip": 0.01149039, + "auxiliary_loss_mlp": 0.01022585, + "balance_loss_clip": 1.04298997, + "balance_loss_mlp": 1.01584625, + "epoch": 0.9485961642517886, + "flos": 20667740532480.0, + "grad_norm": 1.7886601044194215, + "language_loss": 0.77062273, + "learning_rate": 2.759547098130044e-08, + "loss": 0.79233897, + "num_input_tokens_seen": 170478020, + "step": 7889, + "time_per_iteration": 2.4523630142211914 + }, + { + "auxiliary_loss_clip": 0.01161788, + "auxiliary_loss_mlp": 0.01023784, + "balance_loss_clip": 1.04602659, + "balance_loss_mlp": 1.01664329, + "epoch": 0.9487164071424277, + "flos": 22674069578880.0, + "grad_norm": 2.415688712473607, + "language_loss": 0.76696026, + "learning_rate": 2.746666585529267e-08, + "loss": 0.78881598, + "num_input_tokens_seen": 170498295, + "step": 7890, + "time_per_iteration": 3.210886240005493 + }, + { + "auxiliary_loss_clip": 0.01142561, + "auxiliary_loss_mlp": 0.01025073, + "balance_loss_clip": 1.04321933, + "balance_loss_mlp": 1.0180279, + "epoch": 0.9488366500330668, + "flos": 38726461716480.0, + "grad_norm": 2.0665936921282975, + "language_loss": 0.74251044, + "learning_rate": 2.73381599610234e-08, + "loss": 0.76418686, + "num_input_tokens_seen": 170518695, + "step": 7891, + "time_per_iteration": 2.6543033123016357 + }, + { + "auxiliary_loss_clip": 0.01144506, + "auxiliary_loss_mlp": 0.0102551, + "balance_loss_clip": 1.04194331, + "balance_loss_mlp": 1.01799703, + "epoch": 0.9489568929237059, + "flos": 27890920149120.0, + "grad_norm": 1.6947393843977296, + "language_loss": 0.71408939, + "learning_rate": 2.7209953317987033e-08, + "loss": 0.73578954, + "num_input_tokens_seen": 170539735, + "step": 7892, + "time_per_iteration": 2.508558511734009 + }, + { + "auxiliary_loss_clip": 0.01151589, + "auxiliary_loss_mlp": 0.01021677, + "balance_loss_clip": 1.04643822, + "balance_loss_mlp": 1.01466143, + "epoch": 0.9490771358143449, + "flos": 33580642291200.0, + "grad_norm": 3.0216657475812747, + "language_loss": 0.78105861, + "learning_rate": 2.7082045945631793e-08, + "loss": 0.80279124, + "num_input_tokens_seen": 170561950, + "step": 7893, + "time_per_iteration": 2.5604779720306396 + }, + { + "auxiliary_loss_clip": 0.01114849, + "auxiliary_loss_mlp": 0.01022422, + "balance_loss_clip": 1.04189968, + "balance_loss_mlp": 1.01517975, + "epoch": 0.9491973787049841, + "flos": 14793796512000.0, + "grad_norm": 2.18425932258128, + "language_loss": 0.69260025, + "learning_rate": 2.6954437863361712e-08, + "loss": 0.71397299, + "num_input_tokens_seen": 170579865, + "step": 7894, + "time_per_iteration": 2.504115104675293 + }, + { + "auxiliary_loss_clip": 0.01095179, + "auxiliary_loss_mlp": 0.01021642, + "balance_loss_clip": 1.03972673, + "balance_loss_mlp": 1.01520801, + "epoch": 0.9493176215956232, + "flos": 25332535998720.0, + "grad_norm": 2.1189384699708183, + "language_loss": 0.70841491, + "learning_rate": 2.6827129090534862e-08, + "loss": 0.72958314, + "num_input_tokens_seen": 170600165, + "step": 7895, + "time_per_iteration": 2.644534111022949 + }, + { + "auxiliary_loss_clip": 0.01135989, + "auxiliary_loss_mlp": 0.01026754, + "balance_loss_clip": 1.04632664, + "balance_loss_mlp": 1.019279, + "epoch": 0.9494378644862622, + "flos": 21029971236480.0, + "grad_norm": 2.0963763626508687, + "language_loss": 0.77955866, + "learning_rate": 2.670011964646335e-08, + "loss": 0.80118608, + "num_input_tokens_seen": 170618845, + "step": 7896, + "time_per_iteration": 2.4762024879455566 + }, + { + "auxiliary_loss_clip": 0.01083493, + "auxiliary_loss_mlp": 0.01023583, + "balance_loss_clip": 1.03318751, + "balance_loss_mlp": 1.01596892, + "epoch": 0.9495581073769014, + "flos": 15195134148480.0, + "grad_norm": 3.246791396021892, + "language_loss": 0.68356919, + "learning_rate": 2.657340955041487e-08, + "loss": 0.70463997, + "num_input_tokens_seen": 170637620, + "step": 7897, + "time_per_iteration": 2.6020145416259766 + }, + { + "auxiliary_loss_clip": 0.01138134, + "auxiliary_loss_mlp": 0.01026416, + "balance_loss_clip": 1.04704404, + "balance_loss_mlp": 1.01871181, + "epoch": 0.9496783502675404, + "flos": 28616566705920.0, + "grad_norm": 1.8595902935398658, + "language_loss": 0.7189703, + "learning_rate": 2.6446998821611167e-08, + "loss": 0.74061584, + "num_input_tokens_seen": 170657815, + "step": 7898, + "time_per_iteration": 2.5292317867279053 + }, + { + "auxiliary_loss_clip": 0.01109144, + "auxiliary_loss_mlp": 0.0102707, + "balance_loss_clip": 1.0407455, + "balance_loss_mlp": 1.01994395, + "epoch": 0.9497985931581795, + "flos": 14866874732160.0, + "grad_norm": 2.51988940484905, + "language_loss": 0.71779197, + "learning_rate": 2.6320887479228228e-08, + "loss": 0.7391541, + "num_input_tokens_seen": 170674415, + "step": 7899, + "time_per_iteration": 2.5008280277252197 + }, + { + "auxiliary_loss_clip": 0.0113983, + "auxiliary_loss_mlp": 0.01028155, + "balance_loss_clip": 1.04374695, + "balance_loss_mlp": 1.02104425, + "epoch": 0.9499188360488187, + "flos": 27193319136000.0, + "grad_norm": 2.3891901605304007, + "language_loss": 0.726197, + "learning_rate": 2.619507554239786e-08, + "loss": 0.74787682, + "num_input_tokens_seen": 170692975, + "step": 7900, + "time_per_iteration": 2.5016028881073 + }, + { + "auxiliary_loss_clip": 0.01136778, + "auxiliary_loss_mlp": 0.01027396, + "balance_loss_clip": 1.04434419, + "balance_loss_mlp": 1.02016592, + "epoch": 0.9500390789394577, + "flos": 24316479982080.0, + "grad_norm": 1.7727170261963179, + "language_loss": 0.6947031, + "learning_rate": 2.606956303020502e-08, + "loss": 0.71634483, + "num_input_tokens_seen": 170713780, + "step": 7901, + "time_per_iteration": 2.5260939598083496 + }, + { + "auxiliary_loss_clip": 0.0115266, + "auxiliary_loss_mlp": 0.01022778, + "balance_loss_clip": 1.04780269, + "balance_loss_mlp": 1.01538396, + "epoch": 0.9501593218300968, + "flos": 14354752573440.0, + "grad_norm": 1.7317561122193883, + "language_loss": 0.8402015, + "learning_rate": 2.5944349961690036e-08, + "loss": 0.86195588, + "num_input_tokens_seen": 170730800, + "step": 7902, + "time_per_iteration": 2.4129440784454346 + }, + { + "auxiliary_loss_clip": 0.0112145, + "auxiliary_loss_mlp": 0.01021667, + "balance_loss_clip": 1.04260767, + "balance_loss_mlp": 1.01452649, + "epoch": 0.9502795647207359, + "flos": 38728113742080.0, + "grad_norm": 1.5990768469515926, + "language_loss": 0.73021787, + "learning_rate": 2.581943635584749e-08, + "loss": 0.75164902, + "num_input_tokens_seen": 170753630, + "step": 7903, + "time_per_iteration": 2.6594784259796143 + }, + { + "auxiliary_loss_clip": 0.01128696, + "auxiliary_loss_mlp": 0.01019697, + "balance_loss_clip": 1.04357338, + "balance_loss_mlp": 1.01336074, + "epoch": 0.950399807611375, + "flos": 40808023799040.0, + "grad_norm": 1.510223638928864, + "language_loss": 0.65172625, + "learning_rate": 2.569482223162689e-08, + "loss": 0.67321008, + "num_input_tokens_seen": 170777605, + "step": 7904, + "time_per_iteration": 2.660789966583252 + }, + { + "auxiliary_loss_clip": 0.01150816, + "auxiliary_loss_mlp": 0.01021235, + "balance_loss_clip": 1.04420209, + "balance_loss_mlp": 1.01404631, + "epoch": 0.950520050502014, + "flos": 23440403266560.0, + "grad_norm": 1.7361663881273115, + "language_loss": 0.72268748, + "learning_rate": 2.5570507607932e-08, + "loss": 0.74440795, + "num_input_tokens_seen": 170797520, + "step": 7905, + "time_per_iteration": 2.4628560543060303 + }, + { + "auxiliary_loss_clip": 0.01155819, + "auxiliary_loss_mlp": 0.01025197, + "balance_loss_clip": 1.04578066, + "balance_loss_mlp": 1.01793456, + "epoch": 0.9506402933926532, + "flos": 17783718658560.0, + "grad_norm": 3.220973105537502, + "language_loss": 0.63576317, + "learning_rate": 2.54464925036213e-08, + "loss": 0.65757334, + "num_input_tokens_seen": 170814810, + "step": 7906, + "time_per_iteration": 2.4090638160705566 + }, + { + "auxiliary_loss_clip": 0.01149802, + "auxiliary_loss_mlp": 0.0102232, + "balance_loss_clip": 1.04518294, + "balance_loss_mlp": 1.01466048, + "epoch": 0.9507605362832923, + "flos": 32561928668160.0, + "grad_norm": 1.8310937891401635, + "language_loss": 0.60720944, + "learning_rate": 2.532277693750773e-08, + "loss": 0.62893069, + "num_input_tokens_seen": 170835735, + "step": 7907, + "time_per_iteration": 3.38250470161438 + }, + { + "auxiliary_loss_clip": 0.01109587, + "auxiliary_loss_mlp": 0.0102353, + "balance_loss_clip": 1.04533446, + "balance_loss_mlp": 1.01631212, + "epoch": 0.9508807791739313, + "flos": 19602054898560.0, + "grad_norm": 2.0829247431788755, + "language_loss": 0.76211607, + "learning_rate": 2.5199360928358948e-08, + "loss": 0.78344727, + "num_input_tokens_seen": 170852970, + "step": 7908, + "time_per_iteration": 3.339094400405884 + }, + { + "auxiliary_loss_clip": 0.01139686, + "auxiliary_loss_mlp": 0.00761169, + "balance_loss_clip": 1.0419178, + "balance_loss_mlp": 1.00043821, + "epoch": 0.9510010220645704, + "flos": 21471852349440.0, + "grad_norm": 1.778389177432278, + "language_loss": 0.86937213, + "learning_rate": 2.507624449489665e-08, + "loss": 0.88838065, + "num_input_tokens_seen": 170871600, + "step": 7909, + "time_per_iteration": 3.2919118404388428 + }, + { + "auxiliary_loss_clip": 0.01137541, + "auxiliary_loss_mlp": 0.01026501, + "balance_loss_clip": 1.04638052, + "balance_loss_mlp": 1.01918125, + "epoch": 0.9511212649552095, + "flos": 18879999701760.0, + "grad_norm": 1.8383305851584208, + "language_loss": 0.65043491, + "learning_rate": 2.495342765579811e-08, + "loss": 0.67207533, + "num_input_tokens_seen": 170890260, + "step": 7910, + "time_per_iteration": 2.4687838554382324 + }, + { + "auxiliary_loss_clip": 0.01108182, + "auxiliary_loss_mlp": 0.01020726, + "balance_loss_clip": 1.04540312, + "balance_loss_mlp": 1.01413095, + "epoch": 0.9512415078458486, + "flos": 20810521094400.0, + "grad_norm": 1.7067830872009233, + "language_loss": 0.70883018, + "learning_rate": 2.4830910429693984e-08, + "loss": 0.73011923, + "num_input_tokens_seen": 170910220, + "step": 7911, + "time_per_iteration": 2.5718703269958496 + }, + { + "auxiliary_loss_clip": 0.01163859, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.04575384, + "balance_loss_mlp": 1.02097416, + "epoch": 0.9513617507364877, + "flos": 18369565482240.0, + "grad_norm": 2.249737197616849, + "language_loss": 0.79921162, + "learning_rate": 2.470869283517052e-08, + "loss": 0.82113266, + "num_input_tokens_seen": 170928255, + "step": 7912, + "time_per_iteration": 2.3896517753601074 + }, + { + "auxiliary_loss_clip": 0.01143383, + "auxiliary_loss_mlp": 0.0102455, + "balance_loss_clip": 1.04266667, + "balance_loss_mlp": 1.0173403, + "epoch": 0.9514819936271268, + "flos": 25010166412800.0, + "grad_norm": 1.5636190551080271, + "language_loss": 0.76957893, + "learning_rate": 2.458677489076777e-08, + "loss": 0.79125828, + "num_input_tokens_seen": 170949265, + "step": 7913, + "time_per_iteration": 2.507042407989502 + }, + { + "auxiliary_loss_clip": 0.01140404, + "auxiliary_loss_mlp": 0.01027256, + "balance_loss_clip": 1.04278743, + "balance_loss_mlp": 1.02054405, + "epoch": 0.9516022365177659, + "flos": 18662129758080.0, + "grad_norm": 1.8812950454134774, + "language_loss": 0.82991415, + "learning_rate": 2.446515661498072e-08, + "loss": 0.85159069, + "num_input_tokens_seen": 170968595, + "step": 7914, + "time_per_iteration": 2.4373035430908203 + }, + { + "auxiliary_loss_clip": 0.01093446, + "auxiliary_loss_mlp": 0.01026178, + "balance_loss_clip": 1.04129684, + "balance_loss_mlp": 1.01921654, + "epoch": 0.9517224794084049, + "flos": 25372109808000.0, + "grad_norm": 3.3668795184327522, + "language_loss": 0.74233401, + "learning_rate": 2.434383802625861e-08, + "loss": 0.76353025, + "num_input_tokens_seen": 170987550, + "step": 7915, + "time_per_iteration": 2.6068801879882812 + }, + { + "auxiliary_loss_clip": 0.0112181, + "auxiliary_loss_mlp": 0.01024353, + "balance_loss_clip": 1.03996992, + "balance_loss_mlp": 1.01738262, + "epoch": 0.9518427222990441, + "flos": 21470918595840.0, + "grad_norm": 1.7502016370595734, + "language_loss": 0.73787653, + "learning_rate": 2.4222819143005168e-08, + "loss": 0.75933814, + "num_input_tokens_seen": 171007145, + "step": 7916, + "time_per_iteration": 3.2857820987701416 + }, + { + "auxiliary_loss_clip": 0.01162166, + "auxiliary_loss_mlp": 0.01023244, + "balance_loss_clip": 1.04657066, + "balance_loss_mlp": 1.01639533, + "epoch": 0.9519629651896832, + "flos": 21033634423680.0, + "grad_norm": 4.0626144362388175, + "language_loss": 0.81006205, + "learning_rate": 2.4102099983579706e-08, + "loss": 0.83191621, + "num_input_tokens_seen": 171026295, + "step": 7917, + "time_per_iteration": 2.430516481399536 + }, + { + "auxiliary_loss_clip": 0.01149456, + "auxiliary_loss_mlp": 0.01026124, + "balance_loss_clip": 1.04395151, + "balance_loss_mlp": 1.01835704, + "epoch": 0.9520832080803222, + "flos": 21689219502720.0, + "grad_norm": 1.8973443977570357, + "language_loss": 0.77114999, + "learning_rate": 2.3981680566294236e-08, + "loss": 0.79290581, + "num_input_tokens_seen": 171045895, + "step": 7918, + "time_per_iteration": 2.4527502059936523 + }, + { + "auxiliary_loss_clip": 0.01163303, + "auxiliary_loss_mlp": 0.01025004, + "balance_loss_clip": 1.04807699, + "balance_loss_mlp": 1.01848936, + "epoch": 0.9522034509709614, + "flos": 23145289125120.0, + "grad_norm": 1.812190183045559, + "language_loss": 0.73185623, + "learning_rate": 2.3861560909416822e-08, + "loss": 0.75373924, + "num_input_tokens_seen": 171065445, + "step": 7919, + "time_per_iteration": 2.4244539737701416 + }, + { + "auxiliary_loss_clip": 0.01111961, + "auxiliary_loss_mlp": 0.01026699, + "balance_loss_clip": 1.04474926, + "balance_loss_mlp": 1.01994562, + "epoch": 0.9523236938616004, + "flos": 24679428958080.0, + "grad_norm": 1.6848474862599423, + "language_loss": 0.82475185, + "learning_rate": 2.3741741031169325e-08, + "loss": 0.84613848, + "num_input_tokens_seen": 171085015, + "step": 7920, + "time_per_iteration": 2.5710854530334473 + }, + { + "auxiliary_loss_clip": 0.01103872, + "auxiliary_loss_mlp": 0.01027915, + "balance_loss_clip": 1.03927994, + "balance_loss_mlp": 1.02136779, + "epoch": 0.9524439367522395, + "flos": 22672309812480.0, + "grad_norm": 1.8287716841394348, + "language_loss": 0.71649098, + "learning_rate": 2.3622220949728544e-08, + "loss": 0.73780882, + "num_input_tokens_seen": 171103900, + "step": 7921, + "time_per_iteration": 2.547158718109131 + }, + { + "auxiliary_loss_clip": 0.0114276, + "auxiliary_loss_mlp": 0.01027537, + "balance_loss_clip": 1.04337382, + "balance_loss_mlp": 1.01993132, + "epoch": 0.9525641796428787, + "flos": 34055525024640.0, + "grad_norm": 2.8791901054580578, + "language_loss": 0.61022198, + "learning_rate": 2.3503000683225526e-08, + "loss": 0.63192499, + "num_input_tokens_seen": 171121615, + "step": 7922, + "time_per_iteration": 2.545583963394165 + }, + { + "auxiliary_loss_clip": 0.01165562, + "auxiliary_loss_mlp": 0.01024035, + "balance_loss_clip": 1.04637206, + "balance_loss_mlp": 1.01675129, + "epoch": 0.9526844225335177, + "flos": 16727083251840.0, + "grad_norm": 2.132282022322332, + "language_loss": 0.84494841, + "learning_rate": 2.3384080249745585e-08, + "loss": 0.86684436, + "num_input_tokens_seen": 171139505, + "step": 7923, + "time_per_iteration": 2.4152626991271973 + }, + { + "auxiliary_loss_clip": 0.01111939, + "auxiliary_loss_mlp": 0.01024284, + "balance_loss_clip": 1.04118335, + "balance_loss_mlp": 1.01775384, + "epoch": 0.9528046654241568, + "flos": 36939367330560.0, + "grad_norm": 2.3650445255666943, + "language_loss": 0.82840222, + "learning_rate": 2.3265459667329178e-08, + "loss": 0.84976447, + "num_input_tokens_seen": 171158995, + "step": 7924, + "time_per_iteration": 2.6985020637512207 + }, + { + "auxiliary_loss_clip": 0.01137995, + "auxiliary_loss_mlp": 0.01019972, + "balance_loss_clip": 1.04471958, + "balance_loss_mlp": 1.0129447, + "epoch": 0.9529249083147959, + "flos": 18255010032000.0, + "grad_norm": 2.0093128489894374, + "language_loss": 0.85960305, + "learning_rate": 2.31471389539708e-08, + "loss": 0.88118267, + "num_input_tokens_seen": 171176120, + "step": 7925, + "time_per_iteration": 2.4927144050598145 + }, + { + "auxiliary_loss_clip": 0.01151615, + "auxiliary_loss_mlp": 0.00760929, + "balance_loss_clip": 1.0464139, + "balance_loss_mlp": 1.0004046, + "epoch": 0.953045151205435, + "flos": 28658438985600.0, + "grad_norm": 2.239207463529204, + "language_loss": 0.7294575, + "learning_rate": 2.3029118127619872e-08, + "loss": 0.74858296, + "num_input_tokens_seen": 171195835, + "step": 7926, + "time_per_iteration": 2.5613765716552734 + }, + { + "auxiliary_loss_clip": 0.01128845, + "auxiliary_loss_mlp": 0.01022349, + "balance_loss_clip": 1.04283845, + "balance_loss_mlp": 1.01494622, + "epoch": 0.953165394096074, + "flos": 21835232288640.0, + "grad_norm": 2.4410223684445755, + "language_loss": 0.87269485, + "learning_rate": 2.2911397206179628e-08, + "loss": 0.89420676, + "num_input_tokens_seen": 171212585, + "step": 7927, + "time_per_iteration": 2.4757931232452393 + }, + { + "auxiliary_loss_clip": 0.01161736, + "auxiliary_loss_mlp": 0.01024627, + "balance_loss_clip": 1.04604971, + "balance_loss_mlp": 1.01794589, + "epoch": 0.9532856369867132, + "flos": 19975059682560.0, + "grad_norm": 6.581115793409175, + "language_loss": 0.62622267, + "learning_rate": 2.279397620750845e-08, + "loss": 0.64808631, + "num_input_tokens_seen": 171231630, + "step": 7928, + "time_per_iteration": 2.4068655967712402 + }, + { + "auxiliary_loss_clip": 0.01133473, + "auxiliary_loss_mlp": 0.01024837, + "balance_loss_clip": 1.04270196, + "balance_loss_mlp": 1.01843214, + "epoch": 0.9534058798773523, + "flos": 15049588239360.0, + "grad_norm": 2.1314840585865342, + "language_loss": 0.78899407, + "learning_rate": 2.2676855149419195e-08, + "loss": 0.81057721, + "num_input_tokens_seen": 171248800, + "step": 7929, + "time_per_iteration": 2.44342303276062 + }, + { + "auxiliary_loss_clip": 0.0113582, + "auxiliary_loss_mlp": 0.0102509, + "balance_loss_clip": 1.04875267, + "balance_loss_mlp": 1.01795495, + "epoch": 0.9535261227679913, + "flos": 17602800831360.0, + "grad_norm": 2.6539089806052143, + "language_loss": 0.75727117, + "learning_rate": 2.2560034049678988e-08, + "loss": 0.77888036, + "num_input_tokens_seen": 171263150, + "step": 7930, + "time_per_iteration": 2.443641424179077 + }, + { + "auxiliary_loss_clip": 0.01169942, + "auxiliary_loss_mlp": 0.01024312, + "balance_loss_clip": 1.0495379, + "balance_loss_mlp": 1.0171833, + "epoch": 0.9536463656586305, + "flos": 23142954741120.0, + "grad_norm": 1.780541748263086, + "language_loss": 0.75321794, + "learning_rate": 2.2443512926008988e-08, + "loss": 0.77516055, + "num_input_tokens_seen": 171282480, + "step": 7931, + "time_per_iteration": 2.4090731143951416 + }, + { + "auxiliary_loss_clip": 0.01123183, + "auxiliary_loss_mlp": 0.01026154, + "balance_loss_clip": 1.04190397, + "balance_loss_mlp": 1.01912355, + "epoch": 0.9537666085492695, + "flos": 18625033987200.0, + "grad_norm": 2.6809850364366263, + "language_loss": 0.69789481, + "learning_rate": 2.2327291796085946e-08, + "loss": 0.71938813, + "num_input_tokens_seen": 171300840, + "step": 7932, + "time_per_iteration": 2.4717390537261963 + }, + { + "auxiliary_loss_clip": 0.01164471, + "auxiliary_loss_mlp": 0.01025609, + "balance_loss_clip": 1.0455513, + "balance_loss_mlp": 1.01853418, + "epoch": 0.9538868514399086, + "flos": 18989347680000.0, + "grad_norm": 3.124842473277786, + "language_loss": 0.77398485, + "learning_rate": 2.2211370677540197e-08, + "loss": 0.79588568, + "num_input_tokens_seen": 171317365, + "step": 7933, + "time_per_iteration": 3.242696762084961 + }, + { + "auxiliary_loss_clip": 0.0116536, + "auxiliary_loss_mlp": 0.01026135, + "balance_loss_clip": 1.0467689, + "balance_loss_mlp": 1.01891732, + "epoch": 0.9540070943305478, + "flos": 16800556521600.0, + "grad_norm": 3.348007318913691, + "language_loss": 0.7835173, + "learning_rate": 2.2095749587957012e-08, + "loss": 0.8054322, + "num_input_tokens_seen": 171335270, + "step": 7934, + "time_per_iteration": 2.389634609222412 + }, + { + "auxiliary_loss_clip": 0.01131952, + "auxiliary_loss_mlp": 0.01026264, + "balance_loss_clip": 1.04043746, + "balance_loss_mlp": 1.01871836, + "epoch": 0.9541273372211868, + "flos": 20156911263360.0, + "grad_norm": 2.830768016858018, + "language_loss": 0.69362223, + "learning_rate": 2.1980428544876138e-08, + "loss": 0.71520436, + "num_input_tokens_seen": 171353910, + "step": 7935, + "time_per_iteration": 3.239562511444092 + }, + { + "auxiliary_loss_clip": 0.01100154, + "auxiliary_loss_mlp": 0.01024546, + "balance_loss_clip": 1.03607368, + "balance_loss_mlp": 1.01690483, + "epoch": 0.9542475801118259, + "flos": 26725511381760.0, + "grad_norm": 1.5325552328243468, + "language_loss": 0.73867846, + "learning_rate": 2.1865407565791584e-08, + "loss": 0.75992548, + "num_input_tokens_seen": 171375480, + "step": 7936, + "time_per_iteration": 3.4552974700927734 + }, + { + "auxiliary_loss_clip": 0.01135908, + "auxiliary_loss_mlp": 0.01022589, + "balance_loss_clip": 1.0413959, + "balance_loss_mlp": 1.01513267, + "epoch": 0.954367823002465, + "flos": 23330911633920.0, + "grad_norm": 6.233755643949018, + "language_loss": 0.77534211, + "learning_rate": 2.175068666815183e-08, + "loss": 0.79692709, + "num_input_tokens_seen": 171396320, + "step": 7937, + "time_per_iteration": 2.5032925605773926 + }, + { + "auxiliary_loss_clip": 0.01124115, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.04264879, + "balance_loss_mlp": 1.02337813, + "epoch": 0.9544880658931041, + "flos": 14902713527040.0, + "grad_norm": 2.1005712454665537, + "language_loss": 0.78787315, + "learning_rate": 2.163626586935985e-08, + "loss": 0.80942065, + "num_input_tokens_seen": 171412860, + "step": 7938, + "time_per_iteration": 2.511671781539917 + }, + { + "auxiliary_loss_clip": 0.01146452, + "auxiliary_loss_mlp": 0.0102855, + "balance_loss_clip": 1.04323411, + "balance_loss_mlp": 1.02118242, + "epoch": 0.9546083087837431, + "flos": 29095902725760.0, + "grad_norm": 2.280249562238388, + "language_loss": 0.63033187, + "learning_rate": 2.1522145186773755e-08, + "loss": 0.65208197, + "num_input_tokens_seen": 171431780, + "step": 7939, + "time_per_iteration": 2.5599637031555176 + }, + { + "auxiliary_loss_clip": 0.0113347, + "auxiliary_loss_mlp": 0.01025053, + "balance_loss_clip": 1.04318857, + "balance_loss_mlp": 1.01823378, + "epoch": 0.9547285516743822, + "flos": 21142335957120.0, + "grad_norm": 2.3914949473883462, + "language_loss": 0.85148418, + "learning_rate": 2.140832463770481e-08, + "loss": 0.87306941, + "num_input_tokens_seen": 171450975, + "step": 7940, + "time_per_iteration": 2.4612669944763184 + }, + { + "auxiliary_loss_clip": 0.0113996, + "auxiliary_loss_mlp": 0.01021349, + "balance_loss_clip": 1.04350042, + "balance_loss_mlp": 1.01423788, + "epoch": 0.9548487945650214, + "flos": 27490157130240.0, + "grad_norm": 2.594106354653213, + "language_loss": 0.75912237, + "learning_rate": 2.129480423941987e-08, + "loss": 0.78073543, + "num_input_tokens_seen": 171467645, + "step": 7941, + "time_per_iteration": 2.5190694332122803 + }, + { + "auxiliary_loss_clip": 0.01138088, + "auxiliary_loss_mlp": 0.01022245, + "balance_loss_clip": 1.04352808, + "balance_loss_mlp": 1.01566744, + "epoch": 0.9549690374556604, + "flos": 22273198819200.0, + "grad_norm": 1.628924507148828, + "language_loss": 0.80196249, + "learning_rate": 2.1181584009140052e-08, + "loss": 0.82356584, + "num_input_tokens_seen": 171487185, + "step": 7942, + "time_per_iteration": 3.177783250808716 + }, + { + "auxiliary_loss_clip": 0.01131414, + "auxiliary_loss_mlp": 0.01025015, + "balance_loss_clip": 1.04435813, + "balance_loss_mlp": 1.01856613, + "epoch": 0.9550892803462995, + "flos": 17595294888960.0, + "grad_norm": 2.2168011145549005, + "language_loss": 0.83744979, + "learning_rate": 2.10686639640405e-08, + "loss": 0.85901415, + "num_input_tokens_seen": 171501275, + "step": 7943, + "time_per_iteration": 2.4626402854919434 + }, + { + "auxiliary_loss_clip": 0.01153053, + "auxiliary_loss_mlp": 0.01024652, + "balance_loss_clip": 1.04559457, + "balance_loss_mlp": 1.01724911, + "epoch": 0.9552095232369386, + "flos": 24353144789760.0, + "grad_norm": 1.7419552880218059, + "language_loss": 0.81155145, + "learning_rate": 2.0956044121251294e-08, + "loss": 0.83332855, + "num_input_tokens_seen": 171520060, + "step": 7944, + "time_per_iteration": 2.489194393157959 + }, + { + "auxiliary_loss_clip": 0.01123822, + "auxiliary_loss_mlp": 0.01024473, + "balance_loss_clip": 1.04558134, + "balance_loss_mlp": 1.01712394, + "epoch": 0.9553297661275777, + "flos": 22746860490240.0, + "grad_norm": 1.7479338731445038, + "language_loss": 0.81214035, + "learning_rate": 2.084372449785654e-08, + "loss": 0.83362329, + "num_input_tokens_seen": 171539895, + "step": 7945, + "time_per_iteration": 2.5234341621398926 + }, + { + "auxiliary_loss_clip": 0.01131955, + "auxiliary_loss_mlp": 0.01025876, + "balance_loss_clip": 1.04187047, + "balance_loss_mlp": 1.01883042, + "epoch": 0.9554500090182168, + "flos": 15413866018560.0, + "grad_norm": 1.7917656836217764, + "language_loss": 0.68747103, + "learning_rate": 2.0731705110895282e-08, + "loss": 0.70904934, + "num_input_tokens_seen": 171557385, + "step": 7946, + "time_per_iteration": 2.4476161003112793 + }, + { + "auxiliary_loss_clip": 0.01155151, + "auxiliary_loss_mlp": 0.01024817, + "balance_loss_clip": 1.0483501, + "balance_loss_mlp": 1.01714623, + "epoch": 0.9555702519088559, + "flos": 23513517400320.0, + "grad_norm": 2.0195917611910446, + "language_loss": 0.86830354, + "learning_rate": 2.0619985977360587e-08, + "loss": 0.89010322, + "num_input_tokens_seen": 171575705, + "step": 7947, + "time_per_iteration": 2.4610509872436523 + }, + { + "auxiliary_loss_clip": 0.01107197, + "auxiliary_loss_mlp": 0.0102604, + "balance_loss_clip": 1.03770041, + "balance_loss_mlp": 1.01914978, + "epoch": 0.955690494799495, + "flos": 22962072827520.0, + "grad_norm": 1.853082052047393, + "language_loss": 0.76820534, + "learning_rate": 2.0508567114200237e-08, + "loss": 0.78953773, + "num_input_tokens_seen": 171595620, + "step": 7948, + "time_per_iteration": 2.5383851528167725 + }, + { + "auxiliary_loss_clip": 0.01139389, + "auxiliary_loss_mlp": 0.01022111, + "balance_loss_clip": 1.04455066, + "balance_loss_mlp": 1.0154798, + "epoch": 0.955810737690134, + "flos": 26031250333440.0, + "grad_norm": 3.7066386509107794, + "language_loss": 0.78812921, + "learning_rate": 2.0397448538316485e-08, + "loss": 0.80974424, + "num_input_tokens_seen": 171616660, + "step": 7949, + "time_per_iteration": 2.549020290374756 + }, + { + "auxiliary_loss_clip": 0.01117263, + "auxiliary_loss_mlp": 0.01023825, + "balance_loss_clip": 1.04193544, + "balance_loss_mlp": 1.01698875, + "epoch": 0.9559309805807732, + "flos": 20849951249280.0, + "grad_norm": 2.79569559823281, + "language_loss": 0.66580039, + "learning_rate": 2.028663026656563e-08, + "loss": 0.68721128, + "num_input_tokens_seen": 171635515, + "step": 7950, + "time_per_iteration": 2.505826473236084 + }, + { + "auxiliary_loss_clip": 0.01162852, + "auxiliary_loss_mlp": 0.00762192, + "balance_loss_clip": 1.04665995, + "balance_loss_mlp": 1.00042915, + "epoch": 0.9560512234714122, + "flos": 21578219498880.0, + "grad_norm": 1.9237452955086372, + "language_loss": 0.71839231, + "learning_rate": 2.0176112315758885e-08, + "loss": 0.73764277, + "num_input_tokens_seen": 171653305, + "step": 7951, + "time_per_iteration": 2.4165947437286377 + }, + { + "auxiliary_loss_clip": 0.01116675, + "auxiliary_loss_mlp": 0.01028879, + "balance_loss_clip": 1.04335201, + "balance_loss_mlp": 1.02158952, + "epoch": 0.9561714663620513, + "flos": 17450144029440.0, + "grad_norm": 3.264957688021787, + "language_loss": 0.69134521, + "learning_rate": 2.0065894702661957e-08, + "loss": 0.71280074, + "num_input_tokens_seen": 171669980, + "step": 7952, + "time_per_iteration": 2.504631519317627 + }, + { + "auxiliary_loss_clip": 0.01115792, + "auxiliary_loss_mlp": 0.00762011, + "balance_loss_clip": 1.04038048, + "balance_loss_mlp": 1.00043249, + "epoch": 0.9562917092526905, + "flos": 26098510550400.0, + "grad_norm": 1.9200275155092934, + "language_loss": 0.7782023, + "learning_rate": 1.9955977443994577e-08, + "loss": 0.79698032, + "num_input_tokens_seen": 171689970, + "step": 7953, + "time_per_iteration": 2.5528924465179443 + }, + { + "auxiliary_loss_clip": 0.01139017, + "auxiliary_loss_mlp": 0.0102832, + "balance_loss_clip": 1.04606366, + "balance_loss_mlp": 1.02061868, + "epoch": 0.9564119521433295, + "flos": 24096742531200.0, + "grad_norm": 1.9844726399692598, + "language_loss": 0.61818546, + "learning_rate": 1.9846360556430965e-08, + "loss": 0.63985884, + "num_input_tokens_seen": 171708270, + "step": 7954, + "time_per_iteration": 2.486854076385498 + }, + { + "auxiliary_loss_clip": 0.01161706, + "auxiliary_loss_mlp": 0.01020885, + "balance_loss_clip": 1.0447855, + "balance_loss_mlp": 1.01412845, + "epoch": 0.9565321950339686, + "flos": 32008903896960.0, + "grad_norm": 2.1461605526826943, + "language_loss": 0.61596388, + "learning_rate": 1.973704405660004e-08, + "loss": 0.63778973, + "num_input_tokens_seen": 171729385, + "step": 7955, + "time_per_iteration": 2.4807586669921875 + }, + { + "auxiliary_loss_clip": 0.0109332, + "auxiliary_loss_mlp": 0.01024786, + "balance_loss_clip": 1.04128313, + "balance_loss_mlp": 1.01813412, + "epoch": 0.9566524379246077, + "flos": 23588642695680.0, + "grad_norm": 1.4773026871911497, + "language_loss": 0.78230816, + "learning_rate": 1.9628027961085203e-08, + "loss": 0.80348927, + "num_input_tokens_seen": 171752615, + "step": 7956, + "time_per_iteration": 2.6084582805633545 + }, + { + "auxiliary_loss_clip": 0.01109845, + "auxiliary_loss_mlp": 0.01021198, + "balance_loss_clip": 1.03786778, + "balance_loss_mlp": 1.01438785, + "epoch": 0.9567726808152468, + "flos": 38067716240640.0, + "grad_norm": 1.7272869933097705, + "language_loss": 0.83818829, + "learning_rate": 1.9519312286423894e-08, + "loss": 0.85949874, + "num_input_tokens_seen": 171775810, + "step": 7957, + "time_per_iteration": 2.6442482471466064 + }, + { + "auxiliary_loss_clip": 0.01147918, + "auxiliary_loss_mlp": 0.01020035, + "balance_loss_clip": 1.04619336, + "balance_loss_mlp": 1.0128262, + "epoch": 0.9568929237058859, + "flos": 22744059229440.0, + "grad_norm": 2.0668731747494506, + "language_loss": 0.77823257, + "learning_rate": 1.9410897049108255e-08, + "loss": 0.79991215, + "num_input_tokens_seen": 171795090, + "step": 7958, + "time_per_iteration": 2.4371297359466553 + }, + { + "auxiliary_loss_clip": 0.01172366, + "auxiliary_loss_mlp": 0.01024694, + "balance_loss_clip": 1.05144978, + "balance_loss_mlp": 1.01723456, + "epoch": 0.957013166596525, + "flos": 23841633162240.0, + "grad_norm": 2.1492120647910298, + "language_loss": 0.91172862, + "learning_rate": 1.9302782265584905e-08, + "loss": 0.93369925, + "num_input_tokens_seen": 171815755, + "step": 7959, + "time_per_iteration": 3.2843074798583984 + }, + { + "auxiliary_loss_clip": 0.01098423, + "auxiliary_loss_mlp": 0.01026196, + "balance_loss_clip": 1.04239273, + "balance_loss_mlp": 1.01895714, + "epoch": 0.9571334094871641, + "flos": 17639286071040.0, + "grad_norm": 2.1963974297954882, + "language_loss": 0.87164724, + "learning_rate": 1.9194967952254282e-08, + "loss": 0.89289337, + "num_input_tokens_seen": 171834330, + "step": 7960, + "time_per_iteration": 2.5030741691589355 + }, + { + "auxiliary_loss_clip": 0.0115075, + "auxiliary_loss_mlp": 0.0102257, + "balance_loss_clip": 1.04653263, + "balance_loss_mlp": 1.01517904, + "epoch": 0.9572536523778031, + "flos": 15369623441280.0, + "grad_norm": 2.1855534926130233, + "language_loss": 0.81008863, + "learning_rate": 1.9087454125472635e-08, + "loss": 0.83182186, + "num_input_tokens_seen": 171848805, + "step": 7961, + "time_per_iteration": 2.4061107635498047 + }, + { + "auxiliary_loss_clip": 0.01164972, + "auxiliary_loss_mlp": 0.01021859, + "balance_loss_clip": 1.04690921, + "balance_loss_mlp": 1.01451898, + "epoch": 0.9573738952684423, + "flos": 24969838417920.0, + "grad_norm": 2.024653760470153, + "language_loss": 0.78619277, + "learning_rate": 1.8980240801548696e-08, + "loss": 0.80806106, + "num_input_tokens_seen": 171867995, + "step": 7962, + "time_per_iteration": 3.1779263019561768 + }, + { + "auxiliary_loss_clip": 0.01135594, + "auxiliary_loss_mlp": 0.01022001, + "balance_loss_clip": 1.04747093, + "balance_loss_mlp": 1.01507747, + "epoch": 0.9574941381590814, + "flos": 25769461034880.0, + "grad_norm": 1.6299165619662728, + "language_loss": 0.74275267, + "learning_rate": 1.8873327996747458e-08, + "loss": 0.7643286, + "num_input_tokens_seen": 171886495, + "step": 7963, + "time_per_iteration": 3.326136589050293 + }, + { + "auxiliary_loss_clip": 0.01152431, + "auxiliary_loss_mlp": 0.01024561, + "balance_loss_clip": 1.0443151, + "balance_loss_mlp": 1.01774478, + "epoch": 0.9576143810497204, + "flos": 32307178435200.0, + "grad_norm": 1.9668565514878975, + "language_loss": 0.65918481, + "learning_rate": 1.8766715727287053e-08, + "loss": 0.68095475, + "num_input_tokens_seen": 171908200, + "step": 7964, + "time_per_iteration": 2.525831699371338 + }, + { + "auxiliary_loss_clip": 0.01153872, + "auxiliary_loss_mlp": 0.00761995, + "balance_loss_clip": 1.04485583, + "balance_loss_mlp": 1.00050509, + "epoch": 0.9577346239403596, + "flos": 27745733376000.0, + "grad_norm": 1.5841434800486671, + "language_loss": 0.79479587, + "learning_rate": 1.8660404009340546e-08, + "loss": 0.81395447, + "num_input_tokens_seen": 171928650, + "step": 7965, + "time_per_iteration": 2.4738192558288574 + }, + { + "auxiliary_loss_clip": 0.01052641, + "auxiliary_loss_mlp": 0.01001325, + "balance_loss_clip": 1.00757861, + "balance_loss_mlp": 1.00046718, + "epoch": 0.9578548668309986, + "flos": 57468313710720.0, + "grad_norm": 0.8629714234026357, + "language_loss": 0.59536701, + "learning_rate": 1.8554392859035485e-08, + "loss": 0.61590672, + "num_input_tokens_seen": 171986400, + "step": 7966, + "time_per_iteration": 3.0143535137176514 + }, + { + "auxiliary_loss_clip": 0.01082606, + "auxiliary_loss_mlp": 0.01023758, + "balance_loss_clip": 1.03738642, + "balance_loss_mlp": 1.01682019, + "epoch": 0.9579751097216377, + "flos": 19756040503680.0, + "grad_norm": 1.7255162415298613, + "language_loss": 0.78941423, + "learning_rate": 1.8448682292453444e-08, + "loss": 0.81047785, + "num_input_tokens_seen": 172005475, + "step": 7967, + "time_per_iteration": 2.575777530670166 + }, + { + "auxiliary_loss_clip": 0.011644, + "auxiliary_loss_mlp": 0.00761294, + "balance_loss_clip": 1.04726803, + "balance_loss_mlp": 1.00043714, + "epoch": 0.9580953526122769, + "flos": 18041270152320.0, + "grad_norm": 1.9599523951132716, + "language_loss": 0.65948665, + "learning_rate": 1.8343272325631154e-08, + "loss": 0.6787436, + "num_input_tokens_seen": 172024420, + "step": 7968, + "time_per_iteration": 2.3758771419525146 + }, + { + "auxiliary_loss_clip": 0.01085485, + "auxiliary_loss_mlp": 0.01028224, + "balance_loss_clip": 1.03934121, + "balance_loss_mlp": 1.02048755, + "epoch": 0.9582155955029159, + "flos": 24270154416000.0, + "grad_norm": 2.2049259503429677, + "language_loss": 0.77987635, + "learning_rate": 1.8238162974558492e-08, + "loss": 0.80101335, + "num_input_tokens_seen": 172038350, + "step": 7969, + "time_per_iteration": 3.3561575412750244 + }, + { + "auxiliary_loss_clip": 0.01134892, + "auxiliary_loss_mlp": 0.0102909, + "balance_loss_clip": 1.04508424, + "balance_loss_mlp": 1.02177644, + "epoch": 0.958335838393555, + "flos": 22783309816320.0, + "grad_norm": 2.2488678942268163, + "language_loss": 0.74685216, + "learning_rate": 1.8133354255181144e-08, + "loss": 0.76849198, + "num_input_tokens_seen": 172058665, + "step": 7970, + "time_per_iteration": 2.4849183559417725 + }, + { + "auxiliary_loss_clip": 0.01143761, + "auxiliary_loss_mlp": 0.01025063, + "balance_loss_clip": 1.04262257, + "balance_loss_mlp": 1.01827741, + "epoch": 0.958456081284194, + "flos": 16911484698240.0, + "grad_norm": 2.037698247234785, + "language_loss": 0.74285889, + "learning_rate": 1.802884618339795e-08, + "loss": 0.76454711, + "num_input_tokens_seen": 172077470, + "step": 7971, + "time_per_iteration": 2.4473037719726562 + }, + { + "auxiliary_loss_clip": 0.01152308, + "auxiliary_loss_mlp": 0.01022579, + "balance_loss_clip": 1.04821038, + "balance_loss_mlp": 1.01518214, + "epoch": 0.9585763241748332, + "flos": 19974951941760.0, + "grad_norm": 2.000170344588468, + "language_loss": 0.81331193, + "learning_rate": 1.7924638775062894e-08, + "loss": 0.83506083, + "num_input_tokens_seen": 172096590, + "step": 7972, + "time_per_iteration": 2.4401140213012695 + }, + { + "auxiliary_loss_clip": 0.01118317, + "auxiliary_loss_mlp": 0.01025462, + "balance_loss_clip": 1.04457879, + "balance_loss_mlp": 1.01850927, + "epoch": 0.9586965670654722, + "flos": 21395649646080.0, + "grad_norm": 1.967503584092124, + "language_loss": 0.81185436, + "learning_rate": 1.7820732045984444e-08, + "loss": 0.83329213, + "num_input_tokens_seen": 172116735, + "step": 7973, + "time_per_iteration": 2.5631072521209717 + }, + { + "auxiliary_loss_clip": 0.01148353, + "auxiliary_loss_mlp": 0.01024575, + "balance_loss_clip": 1.04366922, + "balance_loss_mlp": 1.01722538, + "epoch": 0.9588168099561113, + "flos": 21435115714560.0, + "grad_norm": 1.712006301599308, + "language_loss": 0.74138802, + "learning_rate": 1.7717126011924655e-08, + "loss": 0.76311725, + "num_input_tokens_seen": 172138320, + "step": 7974, + "time_per_iteration": 2.505289316177368 + }, + { + "auxiliary_loss_clip": 0.01101998, + "auxiliary_loss_mlp": 0.01024133, + "balance_loss_clip": 1.0366025, + "balance_loss_mlp": 1.01701307, + "epoch": 0.9589370528467505, + "flos": 11763761852160.0, + "grad_norm": 2.674116235276764, + "language_loss": 0.76672012, + "learning_rate": 1.7613820688600957e-08, + "loss": 0.78798145, + "num_input_tokens_seen": 172154225, + "step": 7975, + "time_per_iteration": 2.5380561351776123 + }, + { + "auxiliary_loss_clip": 0.01142381, + "auxiliary_loss_mlp": 0.01024792, + "balance_loss_clip": 1.0422163, + "balance_loss_mlp": 1.01819932, + "epoch": 0.9590572957373895, + "flos": 23441516588160.0, + "grad_norm": 1.8174654296468669, + "language_loss": 0.7863487, + "learning_rate": 1.7510816091684588e-08, + "loss": 0.80802035, + "num_input_tokens_seen": 172174150, + "step": 7976, + "time_per_iteration": 2.4980061054229736 + }, + { + "auxiliary_loss_clip": 0.01138007, + "auxiliary_loss_mlp": 0.01026886, + "balance_loss_clip": 1.04491901, + "balance_loss_mlp": 1.01980507, + "epoch": 0.9591775386280286, + "flos": 22528272274560.0, + "grad_norm": 2.996606761601389, + "language_loss": 0.78698194, + "learning_rate": 1.740811223680083e-08, + "loss": 0.80863088, + "num_input_tokens_seen": 172191005, + "step": 7977, + "time_per_iteration": 2.467432737350464 + }, + { + "auxiliary_loss_clip": 0.01164301, + "auxiliary_loss_mlp": 0.0102541, + "balance_loss_clip": 1.04647398, + "balance_loss_mlp": 1.018013, + "epoch": 0.9592977815186677, + "flos": 18186959715840.0, + "grad_norm": 2.3649816352065645, + "language_loss": 0.7385416, + "learning_rate": 1.7305709139530334e-08, + "loss": 0.7604388, + "num_input_tokens_seen": 172209785, + "step": 7978, + "time_per_iteration": 2.3876655101776123 + }, + { + "auxiliary_loss_clip": 0.01143611, + "auxiliary_loss_mlp": 0.0102323, + "balance_loss_clip": 1.04281366, + "balance_loss_mlp": 1.01604724, + "epoch": 0.9594180244093068, + "flos": 16537797555840.0, + "grad_norm": 2.297310040866127, + "language_loss": 0.74602473, + "learning_rate": 1.7203606815407334e-08, + "loss": 0.76769316, + "num_input_tokens_seen": 172224380, + "step": 7979, + "time_per_iteration": 2.3791959285736084 + }, + { + "auxiliary_loss_clip": 0.01143854, + "auxiliary_loss_mlp": 0.0102526, + "balance_loss_clip": 1.04835653, + "balance_loss_mlp": 1.0182147, + "epoch": 0.9595382672999458, + "flos": 20554334317440.0, + "grad_norm": 1.6140123118052505, + "language_loss": 0.79366875, + "learning_rate": 1.7101805279920557e-08, + "loss": 0.81535983, + "num_input_tokens_seen": 172242540, + "step": 7980, + "time_per_iteration": 2.459242105484009 + }, + { + "auxiliary_loss_clip": 0.01164608, + "auxiliary_loss_mlp": 0.01021678, + "balance_loss_clip": 1.04711461, + "balance_loss_mlp": 1.01416826, + "epoch": 0.959658510190585, + "flos": 22638266697600.0, + "grad_norm": 2.1376761115108267, + "language_loss": 0.80899394, + "learning_rate": 1.7000304548513643e-08, + "loss": 0.8308568, + "num_input_tokens_seen": 172262645, + "step": 7981, + "time_per_iteration": 2.4031434059143066 + }, + { + "auxiliary_loss_clip": 0.0111909, + "auxiliary_loss_mlp": 0.01025055, + "balance_loss_clip": 1.04051685, + "balance_loss_mlp": 1.0180366, + "epoch": 0.9597787530812241, + "flos": 19135252725120.0, + "grad_norm": 2.354070304963403, + "language_loss": 0.82593018, + "learning_rate": 1.6899104636583394e-08, + "loss": 0.84737158, + "num_input_tokens_seen": 172280695, + "step": 7982, + "time_per_iteration": 2.5189952850341797 + }, + { + "auxiliary_loss_clip": 0.01052758, + "auxiliary_loss_mlp": 0.01000838, + "balance_loss_clip": 1.00722671, + "balance_loss_mlp": 1.0000155, + "epoch": 0.9598989959718631, + "flos": 60098124055680.0, + "grad_norm": 0.7247936510611176, + "language_loss": 0.61941171, + "learning_rate": 1.6798205559482638e-08, + "loss": 0.63994765, + "num_input_tokens_seen": 172343075, + "step": 7983, + "time_per_iteration": 3.1807339191436768 + }, + { + "auxiliary_loss_clip": 0.01126898, + "auxiliary_loss_mlp": 0.01027383, + "balance_loss_clip": 1.04594326, + "balance_loss_mlp": 1.0198071, + "epoch": 0.9600192388625023, + "flos": 20886795624960.0, + "grad_norm": 2.5727226969359105, + "language_loss": 0.76545107, + "learning_rate": 1.669760733251713e-08, + "loss": 0.78699386, + "num_input_tokens_seen": 172361950, + "step": 7984, + "time_per_iteration": 2.5094847679138184 + }, + { + "auxiliary_loss_clip": 0.01104054, + "auxiliary_loss_mlp": 0.01023181, + "balance_loss_clip": 1.04141057, + "balance_loss_mlp": 1.0167048, + "epoch": 0.9601394817531413, + "flos": 20445740524800.0, + "grad_norm": 1.6676778808878976, + "language_loss": 0.8231622, + "learning_rate": 1.659730997094755e-08, + "loss": 0.84443456, + "num_input_tokens_seen": 172380440, + "step": 7985, + "time_per_iteration": 2.5675392150878906 + }, + { + "auxiliary_loss_clip": 0.01143451, + "auxiliary_loss_mlp": 0.01023737, + "balance_loss_clip": 1.04309511, + "balance_loss_mlp": 1.01680207, + "epoch": 0.9602597246437804, + "flos": 21507152440320.0, + "grad_norm": 2.189371102896242, + "language_loss": 0.62065828, + "learning_rate": 1.6497313489989283e-08, + "loss": 0.64233017, + "num_input_tokens_seen": 172400265, + "step": 7986, + "time_per_iteration": 3.309988021850586 + }, + { + "auxiliary_loss_clip": 0.0110739, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.0347755, + "balance_loss_mlp": 1.01708341, + "epoch": 0.9603799675344196, + "flos": 29935099152000.0, + "grad_norm": 2.176662426371312, + "language_loss": 0.70074391, + "learning_rate": 1.639761790481131e-08, + "loss": 0.72206306, + "num_input_tokens_seen": 172421145, + "step": 7987, + "time_per_iteration": 2.610603094100952 + }, + { + "auxiliary_loss_clip": 0.01153397, + "auxiliary_loss_mlp": 0.01022966, + "balance_loss_clip": 1.04622841, + "balance_loss_mlp": 1.01599848, + "epoch": 0.9605002104250586, + "flos": 28001525103360.0, + "grad_norm": 1.8062192392037126, + "language_loss": 0.79249227, + "learning_rate": 1.6298223230537754e-08, + "loss": 0.81425589, + "num_input_tokens_seen": 172438945, + "step": 7988, + "time_per_iteration": 2.484236717224121 + }, + { + "auxiliary_loss_clip": 0.01133696, + "auxiliary_loss_mlp": 0.00761984, + "balance_loss_clip": 1.04295492, + "balance_loss_mlp": 1.00039208, + "epoch": 0.9606204533156977, + "flos": 35590490870400.0, + "grad_norm": 2.976650739555826, + "language_loss": 0.69689834, + "learning_rate": 1.619912948224611e-08, + "loss": 0.71585512, + "num_input_tokens_seen": 172460150, + "step": 7989, + "time_per_iteration": 4.188363552093506 + }, + { + "auxiliary_loss_clip": 0.01118726, + "auxiliary_loss_mlp": 0.01029061, + "balance_loss_clip": 1.04359126, + "balance_loss_mlp": 1.0212115, + "epoch": 0.9607406962063368, + "flos": 26574614346240.0, + "grad_norm": 2.5082978190502794, + "language_loss": 0.6079818, + "learning_rate": 1.6100336674969682e-08, + "loss": 0.62945962, + "num_input_tokens_seen": 172478990, + "step": 7990, + "time_per_iteration": 2.54133939743042 + }, + { + "auxiliary_loss_clip": 0.01112353, + "auxiliary_loss_mlp": 0.01030608, + "balance_loss_clip": 1.04011869, + "balance_loss_mlp": 1.02298164, + "epoch": 0.9608609390969759, + "flos": 25331781813120.0, + "grad_norm": 1.9375775869444478, + "language_loss": 0.76695067, + "learning_rate": 1.600184482369449e-08, + "loss": 0.78838027, + "num_input_tokens_seen": 172498905, + "step": 7991, + "time_per_iteration": 2.5657451152801514 + }, + { + "auxiliary_loss_clip": 0.01126856, + "auxiliary_loss_mlp": 0.01022371, + "balance_loss_clip": 1.04248345, + "balance_loss_mlp": 1.01446128, + "epoch": 0.960981181987615, + "flos": 21069114082560.0, + "grad_norm": 2.6783739872364594, + "language_loss": 0.8941471, + "learning_rate": 1.5903653943362126e-08, + "loss": 0.9156394, + "num_input_tokens_seen": 172517900, + "step": 7992, + "time_per_iteration": 2.4946417808532715 + }, + { + "auxiliary_loss_clip": 0.01137278, + "auxiliary_loss_mlp": 0.01022231, + "balance_loss_clip": 1.04416943, + "balance_loss_mlp": 1.01561487, + "epoch": 0.9611014248782541, + "flos": 17823256554240.0, + "grad_norm": 1.9655420976840847, + "language_loss": 0.76993406, + "learning_rate": 1.580576404886802e-08, + "loss": 0.79152912, + "num_input_tokens_seen": 172536430, + "step": 7993, + "time_per_iteration": 2.4922561645507812 + }, + { + "auxiliary_loss_clip": 0.01149915, + "auxiliary_loss_mlp": 0.01025593, + "balance_loss_clip": 1.04450226, + "balance_loss_mlp": 1.01886058, + "epoch": 0.9612216677688932, + "flos": 19354631040000.0, + "grad_norm": 2.9965339583526482, + "language_loss": 0.79640782, + "learning_rate": 1.570817515506162e-08, + "loss": 0.81816292, + "num_input_tokens_seen": 172555120, + "step": 7994, + "time_per_iteration": 2.429800033569336 + }, + { + "auxiliary_loss_clip": 0.01162984, + "auxiliary_loss_mlp": 0.01023004, + "balance_loss_clip": 1.04747438, + "balance_loss_mlp": 1.01651645, + "epoch": 0.9613419106595322, + "flos": 15808739207040.0, + "grad_norm": 2.1654653277555913, + "language_loss": 0.81549096, + "learning_rate": 1.561088727674753e-08, + "loss": 0.83735085, + "num_input_tokens_seen": 172569330, + "step": 7995, + "time_per_iteration": 3.1712207794189453 + }, + { + "auxiliary_loss_clip": 0.01124153, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.04307365, + "balance_loss_mlp": 1.02313077, + "epoch": 0.9614621535501714, + "flos": 25702488126720.0, + "grad_norm": 4.480018090819754, + "language_loss": 0.71564418, + "learning_rate": 1.551390042868417e-08, + "loss": 0.73719752, + "num_input_tokens_seen": 172591100, + "step": 7996, + "time_per_iteration": 2.6296262741088867 + }, + { + "auxiliary_loss_clip": 0.0115296, + "auxiliary_loss_mlp": 0.01022448, + "balance_loss_clip": 1.04740715, + "balance_loss_mlp": 1.01515555, + "epoch": 0.9615823964408104, + "flos": 17819054663040.0, + "grad_norm": 2.4070389326038573, + "language_loss": 0.70827353, + "learning_rate": 1.5417214625584207e-08, + "loss": 0.73002762, + "num_input_tokens_seen": 172608755, + "step": 7997, + "time_per_iteration": 2.4281885623931885 + }, + { + "auxiliary_loss_clip": 0.01145111, + "auxiliary_loss_mlp": 0.01021469, + "balance_loss_clip": 1.04231751, + "balance_loss_mlp": 1.01415002, + "epoch": 0.9617026393314495, + "flos": 20190020624640.0, + "grad_norm": 1.7035342154509898, + "language_loss": 0.8509208, + "learning_rate": 1.5320829882114806e-08, + "loss": 0.87258661, + "num_input_tokens_seen": 172626830, + "step": 7998, + "time_per_iteration": 2.4530656337738037 + }, + { + "auxiliary_loss_clip": 0.01162587, + "auxiliary_loss_mlp": 0.01024893, + "balance_loss_clip": 1.04414797, + "balance_loss_mlp": 1.01784742, + "epoch": 0.9618228822220887, + "flos": 20267013427200.0, + "grad_norm": 2.103792161629396, + "language_loss": 0.7945298, + "learning_rate": 1.5224746212897378e-08, + "loss": 0.81640458, + "num_input_tokens_seen": 172646125, + "step": 7999, + "time_per_iteration": 2.397569179534912 + }, + { + "auxiliary_loss_clip": 0.01161861, + "auxiliary_loss_mlp": 0.01022478, + "balance_loss_clip": 1.04578233, + "balance_loss_mlp": 1.01566219, + "epoch": 0.9619431251127277, + "flos": 21031300039680.0, + "grad_norm": 1.7070868867199418, + "language_loss": 0.77420163, + "learning_rate": 1.512896363250804e-08, + "loss": 0.79604501, + "num_input_tokens_seen": 172666235, + "step": 8000, + "time_per_iteration": 2.4012913703918457 + }, + { + "auxiliary_loss_clip": 0.01151927, + "auxiliary_loss_mlp": 0.01024658, + "balance_loss_clip": 1.04453051, + "balance_loss_mlp": 1.01761556, + "epoch": 0.9620633680033668, + "flos": 22382654538240.0, + "grad_norm": 2.0752732430047525, + "language_loss": 0.75432396, + "learning_rate": 1.503348215547673e-08, + "loss": 0.77608979, + "num_input_tokens_seen": 172687325, + "step": 8001, + "time_per_iteration": 2.463216781616211 + }, + { + "auxiliary_loss_clip": 0.01133512, + "auxiliary_loss_mlp": 0.01024969, + "balance_loss_clip": 1.04337692, + "balance_loss_mlp": 1.01809025, + "epoch": 0.962183610894006, + "flos": 18471730740480.0, + "grad_norm": 1.7598745730702738, + "language_loss": 0.80655944, + "learning_rate": 1.4938301796288078e-08, + "loss": 0.82814425, + "num_input_tokens_seen": 172703895, + "step": 8002, + "time_per_iteration": 2.439070701599121 + }, + { + "auxiliary_loss_clip": 0.01163782, + "auxiliary_loss_mlp": 0.01022461, + "balance_loss_clip": 1.04655981, + "balance_loss_mlp": 1.01488554, + "epoch": 0.962303853784645, + "flos": 18435245500800.0, + "grad_norm": 3.8070569528311458, + "language_loss": 0.81698322, + "learning_rate": 1.4843422569380537e-08, + "loss": 0.83884573, + "num_input_tokens_seen": 172720650, + "step": 8003, + "time_per_iteration": 2.3881473541259766 + }, + { + "auxiliary_loss_clip": 0.01105355, + "auxiliary_loss_mlp": 0.01022258, + "balance_loss_clip": 1.03931236, + "balance_loss_mlp": 1.01543045, + "epoch": 0.9624240966752841, + "flos": 26391074826240.0, + "grad_norm": 2.2307647353505273, + "language_loss": 0.82862902, + "learning_rate": 1.4748844489147483e-08, + "loss": 0.84990513, + "num_input_tokens_seen": 172737640, + "step": 8004, + "time_per_iteration": 2.5483381748199463 + }, + { + "auxiliary_loss_clip": 0.01135586, + "auxiliary_loss_mlp": 0.0102131, + "balance_loss_clip": 1.04193461, + "balance_loss_mlp": 1.01512051, + "epoch": 0.9625443395659231, + "flos": 14647675985280.0, + "grad_norm": 1.819601697079657, + "language_loss": 0.71162355, + "learning_rate": 1.4654567569936326e-08, + "loss": 0.7331925, + "num_input_tokens_seen": 172755215, + "step": 8005, + "time_per_iteration": 2.44250750541687 + }, + { + "auxiliary_loss_clip": 0.01103186, + "auxiliary_loss_mlp": 0.01028933, + "balance_loss_clip": 1.03991902, + "balance_loss_mlp": 1.02194762, + "epoch": 0.9626645824565623, + "flos": 18367626147840.0, + "grad_norm": 1.866163557316775, + "language_loss": 0.83024216, + "learning_rate": 1.456059182604874e-08, + "loss": 0.85156333, + "num_input_tokens_seen": 172774020, + "step": 8006, + "time_per_iteration": 2.5249311923980713 + }, + { + "auxiliary_loss_clip": 0.01165618, + "auxiliary_loss_mlp": 0.01027199, + "balance_loss_clip": 1.04809225, + "balance_loss_mlp": 1.01969528, + "epoch": 0.9627848253472013, + "flos": 16580424021120.0, + "grad_norm": 2.0115270960916143, + "language_loss": 0.76493579, + "learning_rate": 1.4466917271740653e-08, + "loss": 0.78686398, + "num_input_tokens_seen": 172792220, + "step": 8007, + "time_per_iteration": 2.3745837211608887 + }, + { + "auxiliary_loss_clip": 0.01132993, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.0425992, + "balance_loss_mlp": 1.01844716, + "epoch": 0.9629050682378404, + "flos": 20886867452160.0, + "grad_norm": 1.8869264242336798, + "language_loss": 0.68009061, + "learning_rate": 1.4373543921222697e-08, + "loss": 0.70167816, + "num_input_tokens_seen": 172811805, + "step": 8008, + "time_per_iteration": 2.478329658508301 + }, + { + "auxiliary_loss_clip": 0.01134216, + "auxiliary_loss_mlp": 0.01025379, + "balance_loss_clip": 1.04455376, + "balance_loss_mlp": 1.01825011, + "epoch": 0.9630253111284796, + "flos": 17019252478080.0, + "grad_norm": 1.819200845425801, + "language_loss": 0.78233594, + "learning_rate": 1.428047178865932e-08, + "loss": 0.80393189, + "num_input_tokens_seen": 172828595, + "step": 8009, + "time_per_iteration": 2.444878578186035 + }, + { + "auxiliary_loss_clip": 0.01134872, + "auxiliary_loss_mlp": 0.01025375, + "balance_loss_clip": 1.04145074, + "balance_loss_mlp": 1.01797211, + "epoch": 0.9631455540191186, + "flos": 20338942412160.0, + "grad_norm": 1.6557753227576513, + "language_loss": 0.74411094, + "learning_rate": 1.4187700888169451e-08, + "loss": 0.76571333, + "num_input_tokens_seen": 172847770, + "step": 8010, + "time_per_iteration": 2.4842169284820557 + }, + { + "auxiliary_loss_clip": 0.01050607, + "auxiliary_loss_mlp": 0.01000932, + "balance_loss_clip": 1.00816846, + "balance_loss_mlp": 1.00010931, + "epoch": 0.9632657969097577, + "flos": 65956700033280.0, + "grad_norm": 0.7521184666515475, + "language_loss": 0.57041264, + "learning_rate": 1.40952312338265e-08, + "loss": 0.59092808, + "num_input_tokens_seen": 172912415, + "step": 8011, + "time_per_iteration": 3.1016392707824707 + }, + { + "auxiliary_loss_clip": 0.01123823, + "auxiliary_loss_mlp": 0.01024639, + "balance_loss_clip": 1.04143739, + "balance_loss_mlp": 1.01772463, + "epoch": 0.9633860398003968, + "flos": 44419523823360.0, + "grad_norm": 1.8663555050622491, + "language_loss": 0.68743664, + "learning_rate": 1.4003062839657909e-08, + "loss": 0.70892125, + "num_input_tokens_seen": 172934895, + "step": 8012, + "time_per_iteration": 3.6033921241760254 + }, + { + "auxiliary_loss_clip": 0.01124424, + "auxiliary_loss_mlp": 0.0101895, + "balance_loss_clip": 1.04278994, + "balance_loss_mlp": 1.01215243, + "epoch": 0.9635062826910359, + "flos": 24827704300800.0, + "grad_norm": 1.6342624660680392, + "language_loss": 0.79802448, + "learning_rate": 1.391119571964583e-08, + "loss": 0.81945819, + "num_input_tokens_seen": 172955835, + "step": 8013, + "time_per_iteration": 2.540250778198242 + }, + { + "auxiliary_loss_clip": 0.01150097, + "auxiliary_loss_mlp": 0.01026722, + "balance_loss_clip": 1.04688573, + "balance_loss_mlp": 1.01936686, + "epoch": 0.9636265255816749, + "flos": 15961360095360.0, + "grad_norm": 1.7553171824376277, + "language_loss": 0.72552949, + "learning_rate": 1.3819629887726225e-08, + "loss": 0.74729764, + "num_input_tokens_seen": 172973925, + "step": 8014, + "time_per_iteration": 2.42315936088562 + }, + { + "auxiliary_loss_clip": 0.01142499, + "auxiliary_loss_mlp": 0.01023382, + "balance_loss_clip": 1.04757643, + "balance_loss_mlp": 1.01628637, + "epoch": 0.9637467684723141, + "flos": 22601781457920.0, + "grad_norm": 2.148582884711216, + "language_loss": 0.76414245, + "learning_rate": 1.3728365357789317e-08, + "loss": 0.78580129, + "num_input_tokens_seen": 172993290, + "step": 8015, + "time_per_iteration": 3.309800386428833 + }, + { + "auxiliary_loss_clip": 0.01086686, + "auxiliary_loss_mlp": 0.01021758, + "balance_loss_clip": 1.0387907, + "balance_loss_mlp": 1.01396489, + "epoch": 0.9638670113629532, + "flos": 17565812801280.0, + "grad_norm": 2.9348131178571517, + "language_loss": 0.76806539, + "learning_rate": 1.3637402143680254e-08, + "loss": 0.78914988, + "num_input_tokens_seen": 173008190, + "step": 8016, + "time_per_iteration": 3.4087624549865723 + }, + { + "auxiliary_loss_clip": 0.01027878, + "auxiliary_loss_mlp": 0.01005326, + "balance_loss_clip": 1.01040268, + "balance_loss_mlp": 1.00456893, + "epoch": 0.9639872542535922, + "flos": 55072139379840.0, + "grad_norm": 0.7240401755084293, + "language_loss": 0.55069923, + "learning_rate": 1.3546740259197998e-08, + "loss": 0.57103133, + "num_input_tokens_seen": 173061000, + "step": 8017, + "time_per_iteration": 3.0416760444641113 + }, + { + "auxiliary_loss_clip": 0.01136952, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.04455602, + "balance_loss_mlp": 1.01946592, + "epoch": 0.9641074971442314, + "flos": 24134484746880.0, + "grad_norm": 2.3182985443941777, + "language_loss": 0.70437837, + "learning_rate": 1.3456379718095989e-08, + "loss": 0.7260198, + "num_input_tokens_seen": 173081415, + "step": 8018, + "time_per_iteration": 2.493027448654175 + }, + { + "auxiliary_loss_clip": 0.01038254, + "auxiliary_loss_mlp": 0.010025, + "balance_loss_clip": 1.0074898, + "balance_loss_mlp": 1.00148702, + "epoch": 0.9642277400348704, + "flos": 66747416077440.0, + "grad_norm": 0.8401229538323925, + "language_loss": 0.6205194, + "learning_rate": 1.3366320534081487e-08, + "loss": 0.64092696, + "num_input_tokens_seen": 173144095, + "step": 8019, + "time_per_iteration": 3.0780186653137207 + }, + { + "auxiliary_loss_clip": 0.01150467, + "auxiliary_loss_mlp": 0.01022583, + "balance_loss_clip": 1.04524851, + "balance_loss_mlp": 1.01557374, + "epoch": 0.9643479829255095, + "flos": 30920272450560.0, + "grad_norm": 2.3020841670504706, + "language_loss": 0.762299, + "learning_rate": 1.3276562720816675e-08, + "loss": 0.78402954, + "num_input_tokens_seen": 173165605, + "step": 8020, + "time_per_iteration": 2.5196034908294678 + }, + { + "auxiliary_loss_clip": 0.01164528, + "auxiliary_loss_mlp": 0.01024522, + "balance_loss_clip": 1.04593754, + "balance_loss_mlp": 1.0171932, + "epoch": 0.9644682258161487, + "flos": 20048245643520.0, + "grad_norm": 2.2740153508614322, + "language_loss": 0.82784462, + "learning_rate": 1.3187106291917549e-08, + "loss": 0.84973514, + "num_input_tokens_seen": 173182595, + "step": 8021, + "time_per_iteration": 2.4315030574798584 + }, + { + "auxiliary_loss_clip": 0.0114628, + "auxiliary_loss_mlp": 0.01020621, + "balance_loss_clip": 1.04464972, + "balance_loss_mlp": 1.01437151, + "epoch": 0.9645884687067877, + "flos": 21178713456000.0, + "grad_norm": 1.6772075495342211, + "language_loss": 0.70642328, + "learning_rate": 1.309795126095503e-08, + "loss": 0.72809225, + "num_input_tokens_seen": 173200895, + "step": 8022, + "time_per_iteration": 3.219061851501465 + }, + { + "auxiliary_loss_clip": 0.01077201, + "auxiliary_loss_mlp": 0.01023354, + "balance_loss_clip": 1.0384481, + "balance_loss_mlp": 1.01615071, + "epoch": 0.9647087115974268, + "flos": 18945967029120.0, + "grad_norm": 2.0479811731810837, + "language_loss": 0.80459267, + "learning_rate": 1.3009097641453192e-08, + "loss": 0.82559824, + "num_input_tokens_seen": 173218745, + "step": 8023, + "time_per_iteration": 2.6150705814361572 + }, + { + "auxiliary_loss_clip": 0.01138224, + "auxiliary_loss_mlp": 0.01020576, + "balance_loss_clip": 1.0463717, + "balance_loss_mlp": 1.01352477, + "epoch": 0.9648289544880659, + "flos": 16545088016640.0, + "grad_norm": 1.6444242924151706, + "language_loss": 0.75677907, + "learning_rate": 1.2920545446891474e-08, + "loss": 0.77836704, + "num_input_tokens_seen": 173235465, + "step": 8024, + "time_per_iteration": 2.4677605628967285 + }, + { + "auxiliary_loss_clip": 0.01138745, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.04642177, + "balance_loss_mlp": 1.02527535, + "epoch": 0.964949197378705, + "flos": 24057527857920.0, + "grad_norm": 1.731194863134548, + "language_loss": 0.70507318, + "learning_rate": 1.2832294690703127e-08, + "loss": 0.72678733, + "num_input_tokens_seen": 173254440, + "step": 8025, + "time_per_iteration": 2.5037741661071777 + }, + { + "auxiliary_loss_clip": 0.01151583, + "auxiliary_loss_mlp": 0.01024383, + "balance_loss_clip": 1.04662836, + "balance_loss_mlp": 1.01701283, + "epoch": 0.965069440269344, + "flos": 23365565280000.0, + "grad_norm": 2.622304409622533, + "language_loss": 0.77384508, + "learning_rate": 1.2744345386275668e-08, + "loss": 0.79560471, + "num_input_tokens_seen": 173273980, + "step": 8026, + "time_per_iteration": 2.452924966812134 + }, + { + "auxiliary_loss_clip": 0.01146549, + "auxiliary_loss_mlp": 0.01023452, + "balance_loss_clip": 1.04961061, + "balance_loss_mlp": 1.01627266, + "epoch": 0.9651896831599832, + "flos": 25374875155200.0, + "grad_norm": 2.094194159590921, + "language_loss": 0.78486365, + "learning_rate": 1.265669754695109e-08, + "loss": 0.80656362, + "num_input_tokens_seen": 173293550, + "step": 8027, + "time_per_iteration": 2.525113344192505 + }, + { + "auxiliary_loss_clip": 0.01097247, + "auxiliary_loss_mlp": 0.01026168, + "balance_loss_clip": 1.0392282, + "balance_loss_mlp": 1.01872373, + "epoch": 0.9653099260506223, + "flos": 22272875596800.0, + "grad_norm": 1.9654110756514116, + "language_loss": 0.8197763, + "learning_rate": 1.2569351186025201e-08, + "loss": 0.84101051, + "num_input_tokens_seen": 173312005, + "step": 8028, + "time_per_iteration": 2.5802760124206543 + }, + { + "auxiliary_loss_clip": 0.01109185, + "auxiliary_loss_mlp": 0.01022687, + "balance_loss_clip": 1.03895581, + "balance_loss_mlp": 1.01583862, + "epoch": 0.9654301689412613, + "flos": 26760847386240.0, + "grad_norm": 1.4557781139830313, + "language_loss": 0.75236416, + "learning_rate": 1.2482306316748737e-08, + "loss": 0.77368289, + "num_input_tokens_seen": 173332450, + "step": 8029, + "time_per_iteration": 2.558187961578369 + }, + { + "auxiliary_loss_clip": 0.01155324, + "auxiliary_loss_mlp": 0.01021752, + "balance_loss_clip": 1.04522073, + "balance_loss_mlp": 1.01528478, + "epoch": 0.9655504118319005, + "flos": 17412689122560.0, + "grad_norm": 2.8016751496578234, + "language_loss": 0.78271544, + "learning_rate": 1.2395562952326021e-08, + "loss": 0.80448627, + "num_input_tokens_seen": 173349610, + "step": 8030, + "time_per_iteration": 2.42197585105896 + }, + { + "auxiliary_loss_clip": 0.01146571, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.04655778, + "balance_loss_mlp": 1.02334261, + "epoch": 0.9656706547225395, + "flos": 22126970551680.0, + "grad_norm": 2.689096297879985, + "language_loss": 0.81186926, + "learning_rate": 1.2309121105916309e-08, + "loss": 0.83365124, + "num_input_tokens_seen": 173367900, + "step": 8031, + "time_per_iteration": 2.510634422302246 + }, + { + "auxiliary_loss_clip": 0.01153724, + "auxiliary_loss_mlp": 0.01022617, + "balance_loss_clip": 1.0461762, + "balance_loss_mlp": 1.01580405, + "epoch": 0.9657908976131786, + "flos": 37049289926400.0, + "grad_norm": 1.942218903475128, + "language_loss": 0.69271475, + "learning_rate": 1.222298079063222e-08, + "loss": 0.71447814, + "num_input_tokens_seen": 173389040, + "step": 8032, + "time_per_iteration": 2.573167085647583 + }, + { + "auxiliary_loss_clip": 0.01148338, + "auxiliary_loss_mlp": 0.0102345, + "balance_loss_clip": 1.04495907, + "balance_loss_mlp": 1.01681042, + "epoch": 0.9659111405038178, + "flos": 24389809597440.0, + "grad_norm": 2.367894053191486, + "language_loss": 0.7275871, + "learning_rate": 1.2137142019541524e-08, + "loss": 0.74930501, + "num_input_tokens_seen": 173407595, + "step": 8033, + "time_per_iteration": 2.483229637145996 + }, + { + "auxiliary_loss_clip": 0.0114312, + "auxiliary_loss_mlp": 0.01025484, + "balance_loss_clip": 1.04368556, + "balance_loss_mlp": 1.01894808, + "epoch": 0.9660313833944568, + "flos": 25009412227200.0, + "grad_norm": 2.0519206501182703, + "language_loss": 0.73214149, + "learning_rate": 1.2051604805666027e-08, + "loss": 0.75382757, + "num_input_tokens_seen": 173424720, + "step": 8034, + "time_per_iteration": 2.508850574493408 + }, + { + "auxiliary_loss_clip": 0.01164758, + "auxiliary_loss_mlp": 0.00761615, + "balance_loss_clip": 1.04720926, + "balance_loss_mlp": 1.0004679, + "epoch": 0.9661516262850959, + "flos": 11801575895040.0, + "grad_norm": 2.2018230336157836, + "language_loss": 0.78638655, + "learning_rate": 1.196636916198135e-08, + "loss": 0.80565029, + "num_input_tokens_seen": 173442260, + "step": 8035, + "time_per_iteration": 2.434105634689331 + }, + { + "auxiliary_loss_clip": 0.01166037, + "auxiliary_loss_mlp": 0.01021356, + "balance_loss_clip": 1.04735994, + "balance_loss_mlp": 1.01456451, + "epoch": 0.9662718691757349, + "flos": 20047778766720.0, + "grad_norm": 2.3415810863217192, + "language_loss": 0.76836479, + "learning_rate": 1.1881435101418036e-08, + "loss": 0.79023874, + "num_input_tokens_seen": 173461675, + "step": 8036, + "time_per_iteration": 2.4042298793792725 + }, + { + "auxiliary_loss_clip": 0.01040999, + "auxiliary_loss_mlp": 0.01000853, + "balance_loss_clip": 1.0084275, + "balance_loss_mlp": 1.00004792, + "epoch": 0.9663921120663741, + "flos": 68027703517440.0, + "grad_norm": 0.7314223204400929, + "language_loss": 0.65539575, + "learning_rate": 1.1796802636860003e-08, + "loss": 0.67581427, + "num_input_tokens_seen": 173530205, + "step": 8037, + "time_per_iteration": 3.1086549758911133 + }, + { + "auxiliary_loss_clip": 0.01164308, + "auxiliary_loss_mlp": 0.01025713, + "balance_loss_clip": 1.04581904, + "balance_loss_mlp": 1.01864982, + "epoch": 0.9665123549570132, + "flos": 26322916769280.0, + "grad_norm": 2.6093817005600024, + "language_loss": 0.73843199, + "learning_rate": 1.1712471781146316e-08, + "loss": 0.76033217, + "num_input_tokens_seen": 173549540, + "step": 8038, + "time_per_iteration": 2.4813179969787598 + }, + { + "auxiliary_loss_clip": 0.01162232, + "auxiliary_loss_mlp": 0.01024857, + "balance_loss_clip": 1.04458714, + "balance_loss_mlp": 1.01766288, + "epoch": 0.9666325978476522, + "flos": 43941121557120.0, + "grad_norm": 2.4006004273797945, + "language_loss": 0.66401631, + "learning_rate": 1.1628442547069628e-08, + "loss": 0.68588722, + "num_input_tokens_seen": 173571740, + "step": 8039, + "time_per_iteration": 3.4404666423797607 + }, + { + "auxiliary_loss_clip": 0.01154698, + "auxiliary_loss_mlp": 0.00762184, + "balance_loss_clip": 1.04512525, + "balance_loss_mlp": 1.00049806, + "epoch": 0.9667528407382914, + "flos": 21543422198400.0, + "grad_norm": 1.869458742041927, + "language_loss": 0.77185112, + "learning_rate": 1.1544714947377521e-08, + "loss": 0.79101992, + "num_input_tokens_seen": 173589425, + "step": 8040, + "time_per_iteration": 2.4575209617614746 + }, + { + "auxiliary_loss_clip": 0.011663, + "auxiliary_loss_mlp": 0.0102747, + "balance_loss_clip": 1.04730177, + "balance_loss_mlp": 1.01940536, + "epoch": 0.9668730836289304, + "flos": 23878585278720.0, + "grad_norm": 2.0660478233787587, + "language_loss": 0.70098913, + "learning_rate": 1.1461288994770945e-08, + "loss": 0.72292686, + "num_input_tokens_seen": 173608500, + "step": 8041, + "time_per_iteration": 2.4305317401885986 + }, + { + "auxiliary_loss_clip": 0.01165788, + "auxiliary_loss_mlp": 0.0102665, + "balance_loss_clip": 1.04506731, + "balance_loss_mlp": 1.01915205, + "epoch": 0.9669933265195695, + "flos": 28293011971200.0, + "grad_norm": 1.8540090468184476, + "language_loss": 0.77420712, + "learning_rate": 1.1378164701906002e-08, + "loss": 0.79613149, + "num_input_tokens_seen": 173630265, + "step": 8042, + "time_per_iteration": 3.2934224605560303 + }, + { + "auxiliary_loss_clip": 0.0116739, + "auxiliary_loss_mlp": 0.0102494, + "balance_loss_clip": 1.04746318, + "balance_loss_mlp": 1.01774251, + "epoch": 0.9671135694102087, + "flos": 22454763091200.0, + "grad_norm": 1.7582208785185078, + "language_loss": 0.67034984, + "learning_rate": 1.1295342081392156e-08, + "loss": 0.69227314, + "num_input_tokens_seen": 173649625, + "step": 8043, + "time_per_iteration": 3.2975029945373535 + }, + { + "auxiliary_loss_clip": 0.01138482, + "auxiliary_loss_mlp": 0.01021146, + "balance_loss_clip": 1.04324889, + "balance_loss_mlp": 1.01383877, + "epoch": 0.9672338123008477, + "flos": 20155941596160.0, + "grad_norm": 1.6501541485930686, + "language_loss": 0.69209087, + "learning_rate": 1.1212821145793804e-08, + "loss": 0.71368718, + "num_input_tokens_seen": 173669240, + "step": 8044, + "time_per_iteration": 2.472506284713745 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.01025928, + "balance_loss_clip": 1.04343355, + "balance_loss_mlp": 1.01865637, + "epoch": 0.9673540551914868, + "flos": 16977487939200.0, + "grad_norm": 1.965621934423793, + "language_loss": 0.78626651, + "learning_rate": 1.1130601907629156e-08, + "loss": 0.80790061, + "num_input_tokens_seen": 173686970, + "step": 8045, + "time_per_iteration": 2.465703010559082 + }, + { + "auxiliary_loss_clip": 0.01052286, + "auxiliary_loss_mlp": 0.01001207, + "balance_loss_clip": 1.00705481, + "balance_loss_mlp": 1.00024748, + "epoch": 0.9674742980821259, + "flos": 61892903952000.0, + "grad_norm": 0.8310061385023739, + "language_loss": 0.64818013, + "learning_rate": 1.1048684379370899e-08, + "loss": 0.668715, + "num_input_tokens_seen": 173747655, + "step": 8046, + "time_per_iteration": 3.0365893840789795 + }, + { + "auxiliary_loss_clip": 0.01127874, + "auxiliary_loss_mlp": 0.01023229, + "balance_loss_clip": 1.04432428, + "balance_loss_mlp": 1.0167743, + "epoch": 0.967594540972765, + "flos": 18697824898560.0, + "grad_norm": 1.975165150900015, + "language_loss": 0.74594229, + "learning_rate": 1.0967068573445759e-08, + "loss": 0.76745331, + "num_input_tokens_seen": 173765140, + "step": 8047, + "time_per_iteration": 2.4526827335357666 + }, + { + "auxiliary_loss_clip": 0.01131824, + "auxiliary_loss_mlp": 0.01024695, + "balance_loss_clip": 1.04140198, + "balance_loss_mlp": 1.01767612, + "epoch": 0.967714783863404, + "flos": 20777411733120.0, + "grad_norm": 2.2523430456295594, + "language_loss": 0.65201139, + "learning_rate": 1.0885754502234945e-08, + "loss": 0.67357659, + "num_input_tokens_seen": 173784800, + "step": 8048, + "time_per_iteration": 2.486752510070801 + }, + { + "auxiliary_loss_clip": 0.01120488, + "auxiliary_loss_mlp": 0.01022743, + "balance_loss_clip": 1.04309964, + "balance_loss_mlp": 1.01585615, + "epoch": 0.9678350267540432, + "flos": 23185473465600.0, + "grad_norm": 2.930101495036063, + "language_loss": 0.77860063, + "learning_rate": 1.08047421780737e-08, + "loss": 0.80003291, + "num_input_tokens_seen": 173803990, + "step": 8049, + "time_per_iteration": 3.294926166534424 + }, + { + "auxiliary_loss_clip": 0.01143411, + "auxiliary_loss_mlp": 0.00761413, + "balance_loss_clip": 1.04477239, + "balance_loss_mlp": 1.00044656, + "epoch": 0.9679552696446823, + "flos": 21726063878400.0, + "grad_norm": 2.39012814248842, + "language_loss": 0.73707503, + "learning_rate": 1.0724031613251305e-08, + "loss": 0.75612324, + "num_input_tokens_seen": 173821890, + "step": 8050, + "time_per_iteration": 2.4826231002807617 + }, + { + "auxiliary_loss_clip": 0.01158448, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.04752302, + "balance_loss_mlp": 1.02058959, + "epoch": 0.9680755125353213, + "flos": 26869046129280.0, + "grad_norm": 2.1964798174937377, + "language_loss": 0.66212511, + "learning_rate": 1.0643622820011744e-08, + "loss": 0.6839906, + "num_input_tokens_seen": 173842945, + "step": 8051, + "time_per_iteration": 2.506913423538208 + }, + { + "auxiliary_loss_clip": 0.01167649, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.04657817, + "balance_loss_mlp": 1.01967049, + "epoch": 0.9681957554259605, + "flos": 28325008010880.0, + "grad_norm": 3.250580618000372, + "language_loss": 0.67942607, + "learning_rate": 1.0563515810552814e-08, + "loss": 0.70137715, + "num_input_tokens_seen": 173859915, + "step": 8052, + "time_per_iteration": 2.4511044025421143 + }, + { + "auxiliary_loss_clip": 0.01167943, + "auxiliary_loss_mlp": 0.01024931, + "balance_loss_clip": 1.05024862, + "balance_loss_mlp": 1.01860392, + "epoch": 0.9683159983165995, + "flos": 20557674282240.0, + "grad_norm": 1.545115350126205, + "language_loss": 0.73504066, + "learning_rate": 1.0483710597026795e-08, + "loss": 0.75696933, + "num_input_tokens_seen": 173879775, + "step": 8053, + "time_per_iteration": 2.428269386291504 + }, + { + "auxiliary_loss_clip": 0.01121768, + "auxiliary_loss_mlp": 0.01027398, + "balance_loss_clip": 1.0420028, + "balance_loss_mlp": 1.02030206, + "epoch": 0.9684362412072386, + "flos": 24207958016640.0, + "grad_norm": 1.9586683093633246, + "language_loss": 0.74320185, + "learning_rate": 1.0404207191540227e-08, + "loss": 0.76469362, + "num_input_tokens_seen": 173900230, + "step": 8054, + "time_per_iteration": 2.552485704421997 + }, + { + "auxiliary_loss_clip": 0.01163779, + "auxiliary_loss_mlp": 0.01023646, + "balance_loss_clip": 1.04628396, + "balance_loss_mlp": 1.016675, + "epoch": 0.9685564840978778, + "flos": 22346241125760.0, + "grad_norm": 1.9054769218514387, + "language_loss": 0.74835038, + "learning_rate": 1.0325005606153236e-08, + "loss": 0.77022457, + "num_input_tokens_seen": 173919690, + "step": 8055, + "time_per_iteration": 2.4212632179260254 + }, + { + "auxiliary_loss_clip": 0.01112163, + "auxiliary_loss_mlp": 0.01024409, + "balance_loss_clip": 1.04104924, + "balance_loss_mlp": 1.01762271, + "epoch": 0.9686767269885168, + "flos": 14386389477120.0, + "grad_norm": 2.4766435932876942, + "language_loss": 0.78819972, + "learning_rate": 1.0246105852881104e-08, + "loss": 0.80956542, + "num_input_tokens_seen": 173934790, + "step": 8056, + "time_per_iteration": 2.5299460887908936 + }, + { + "auxiliary_loss_clip": 0.01165878, + "auxiliary_loss_mlp": 0.0102007, + "balance_loss_clip": 1.0464952, + "balance_loss_mlp": 1.01280379, + "epoch": 0.9687969698791559, + "flos": 21287630471040.0, + "grad_norm": 1.9109274534879837, + "language_loss": 0.78610182, + "learning_rate": 1.0167507943692476e-08, + "loss": 0.80796129, + "num_input_tokens_seen": 173953875, + "step": 8057, + "time_per_iteration": 2.458423137664795 + }, + { + "auxiliary_loss_clip": 0.01148511, + "auxiliary_loss_mlp": 0.01029773, + "balance_loss_clip": 1.04701614, + "balance_loss_mlp": 1.02248335, + "epoch": 0.968917212769795, + "flos": 19828328624640.0, + "grad_norm": 2.2689689022155544, + "language_loss": 0.71478963, + "learning_rate": 1.008921189051093e-08, + "loss": 0.73657244, + "num_input_tokens_seen": 173971220, + "step": 8058, + "time_per_iteration": 2.424872875213623 + }, + { + "auxiliary_loss_clip": 0.01166278, + "auxiliary_loss_mlp": 0.01023702, + "balance_loss_clip": 1.04768538, + "balance_loss_mlp": 1.01653421, + "epoch": 0.9690374556604341, + "flos": 21681749473920.0, + "grad_norm": 1.9392450467574212, + "language_loss": 0.77505368, + "learning_rate": 1.0011217705213848e-08, + "loss": 0.79695344, + "num_input_tokens_seen": 173989095, + "step": 8059, + "time_per_iteration": 2.43770432472229 + }, + { + "auxiliary_loss_clip": 0.01148331, + "auxiliary_loss_mlp": 0.01022937, + "balance_loss_clip": 1.04623878, + "balance_loss_mlp": 1.01673484, + "epoch": 0.9691576985510731, + "flos": 32635437851520.0, + "grad_norm": 2.123343916845261, + "language_loss": 0.74618512, + "learning_rate": 9.933525399632658e-09, + "loss": 0.76789784, + "num_input_tokens_seen": 174007330, + "step": 8060, + "time_per_iteration": 2.5370001792907715 + }, + { + "auxiliary_loss_clip": 0.01135209, + "auxiliary_loss_mlp": 0.01024343, + "balance_loss_clip": 1.0440166, + "balance_loss_mlp": 1.01672876, + "epoch": 0.9692779414417123, + "flos": 35663174040960.0, + "grad_norm": 1.7216145954454074, + "language_loss": 0.65012652, + "learning_rate": 9.856134985553488e-09, + "loss": 0.67172205, + "num_input_tokens_seen": 174027055, + "step": 8061, + "time_per_iteration": 2.6319658756256104 + }, + { + "auxiliary_loss_clip": 0.01165094, + "auxiliary_loss_mlp": 0.01023794, + "balance_loss_clip": 1.04694629, + "balance_loss_mlp": 1.01640892, + "epoch": 0.9693981843323514, + "flos": 28366952117760.0, + "grad_norm": 1.8314184513888254, + "language_loss": 0.73636782, + "learning_rate": 9.77904647471628e-09, + "loss": 0.75825667, + "num_input_tokens_seen": 174050235, + "step": 8062, + "time_per_iteration": 2.462043523788452 + }, + { + "auxiliary_loss_clip": 0.01100896, + "auxiliary_loss_mlp": 0.01023935, + "balance_loss_clip": 1.03848267, + "balance_loss_mlp": 1.01683605, + "epoch": 0.9695184272229904, + "flos": 23622865378560.0, + "grad_norm": 1.627487402718025, + "language_loss": 0.73876345, + "learning_rate": 9.702259878815454e-09, + "loss": 0.76001179, + "num_input_tokens_seen": 174070560, + "step": 8063, + "time_per_iteration": 2.5861976146698 + }, + { + "auxiliary_loss_clip": 0.01154809, + "auxiliary_loss_mlp": 0.01025747, + "balance_loss_clip": 1.04831481, + "balance_loss_mlp": 1.01801002, + "epoch": 0.9696386701136296, + "flos": 23294677789440.0, + "grad_norm": 2.2755134982597247, + "language_loss": 0.74469686, + "learning_rate": 9.625775209499254e-09, + "loss": 0.76650244, + "num_input_tokens_seen": 174090565, + "step": 8064, + "time_per_iteration": 2.446993827819824 + }, + { + "auxiliary_loss_clip": 0.01117169, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.04014075, + "balance_loss_mlp": 1.01973653, + "epoch": 0.9697589130042686, + "flos": 15121876360320.0, + "grad_norm": 2.1073407652064695, + "language_loss": 0.74444348, + "learning_rate": 9.549592478370172e-09, + "loss": 0.76588207, + "num_input_tokens_seen": 174108745, + "step": 8065, + "time_per_iteration": 2.489091396331787 + }, + { + "auxiliary_loss_clip": 0.01151399, + "auxiliary_loss_mlp": 0.01022232, + "balance_loss_clip": 1.04410863, + "balance_loss_mlp": 1.0153296, + "epoch": 0.9698791558949077, + "flos": 18879532824960.0, + "grad_norm": 1.8332367474606057, + "language_loss": 0.79838645, + "learning_rate": 9.473711696985632e-09, + "loss": 0.82012278, + "num_input_tokens_seen": 174128075, + "step": 8066, + "time_per_iteration": 3.286348581314087 + }, + { + "auxiliary_loss_clip": 0.01134805, + "auxiliary_loss_mlp": 0.01026192, + "balance_loss_clip": 1.04295421, + "balance_loss_mlp": 1.01897371, + "epoch": 0.9699993987855468, + "flos": 17931455297280.0, + "grad_norm": 2.5280725916669637, + "language_loss": 0.75695115, + "learning_rate": 9.398132876856201e-09, + "loss": 0.77856117, + "num_input_tokens_seen": 174147040, + "step": 8067, + "time_per_iteration": 2.44856858253479 + }, + { + "auxiliary_loss_clip": 0.01019933, + "auxiliary_loss_mlp": 0.01001354, + "balance_loss_clip": 1.00769925, + "balance_loss_mlp": 1.00036418, + "epoch": 0.9701196416761859, + "flos": 67182186297600.0, + "grad_norm": 0.7725819647919476, + "language_loss": 0.60848546, + "learning_rate": 9.322856029447379e-09, + "loss": 0.62869823, + "num_input_tokens_seen": 174208225, + "step": 8068, + "time_per_iteration": 3.8124983310699463 + }, + { + "auxiliary_loss_clip": 0.01163216, + "auxiliary_loss_mlp": 0.01027093, + "balance_loss_clip": 1.04721701, + "balance_loss_mlp": 1.02021766, + "epoch": 0.970239884566825, + "flos": 24277804012800.0, + "grad_norm": 3.0484449822882316, + "language_loss": 0.80546355, + "learning_rate": 9.247881166178695e-09, + "loss": 0.82736659, + "num_input_tokens_seen": 174226935, + "step": 8069, + "time_per_iteration": 2.439831018447876 + }, + { + "auxiliary_loss_clip": 0.0113341, + "auxiliary_loss_mlp": 0.01025749, + "balance_loss_clip": 1.04554296, + "balance_loss_mlp": 1.01879644, + "epoch": 0.970360127457464, + "flos": 25301689194240.0, + "grad_norm": 2.2817928502613936, + "language_loss": 0.76568025, + "learning_rate": 9.173208298423274e-09, + "loss": 0.78727186, + "num_input_tokens_seen": 174248140, + "step": 8070, + "time_per_iteration": 3.4088246822357178 + }, + { + "auxiliary_loss_clip": 0.01107803, + "auxiliary_loss_mlp": 0.00762062, + "balance_loss_clip": 1.04301524, + "balance_loss_mlp": 1.00040603, + "epoch": 0.9704803703481032, + "flos": 29572473398400.0, + "grad_norm": 1.603832583363965, + "language_loss": 0.76314199, + "learning_rate": 9.09883743750961e-09, + "loss": 0.78184056, + "num_input_tokens_seen": 174271030, + "step": 8071, + "time_per_iteration": 2.653918504714966 + }, + { + "auxiliary_loss_clip": 0.0113509, + "auxiliary_loss_mlp": 0.01023769, + "balance_loss_clip": 1.04407382, + "balance_loss_mlp": 1.0167892, + "epoch": 0.9706006132387422, + "flos": 17380046638080.0, + "grad_norm": 1.5501400718443472, + "language_loss": 0.83708215, + "learning_rate": 9.024768594719124e-09, + "loss": 0.85867071, + "num_input_tokens_seen": 174289410, + "step": 8072, + "time_per_iteration": 2.5123648643493652 + }, + { + "auxiliary_loss_clip": 0.01127061, + "auxiliary_loss_mlp": 0.0101999, + "balance_loss_clip": 1.04702282, + "balance_loss_mlp": 1.01304591, + "epoch": 0.9707208561293813, + "flos": 18186421011840.0, + "grad_norm": 2.3703793545808134, + "language_loss": 0.72509736, + "learning_rate": 8.95100178128816e-09, + "loss": 0.74656785, + "num_input_tokens_seen": 174308550, + "step": 8073, + "time_per_iteration": 2.4899332523345947 + }, + { + "auxiliary_loss_clip": 0.01137115, + "auxiliary_loss_mlp": 0.01024758, + "balance_loss_clip": 1.0438652, + "balance_loss_mlp": 1.01708102, + "epoch": 0.9708410990200205, + "flos": 31248388212480.0, + "grad_norm": 2.188654683360268, + "language_loss": 0.69983995, + "learning_rate": 8.877537008407321e-09, + "loss": 0.72145867, + "num_input_tokens_seen": 174328600, + "step": 8074, + "time_per_iteration": 2.5641233921051025 + }, + { + "auxiliary_loss_clip": 0.01140478, + "auxiliary_loss_mlp": 0.01022969, + "balance_loss_clip": 1.04446626, + "balance_loss_mlp": 1.01608467, + "epoch": 0.9709613419106595, + "flos": 30554450386560.0, + "grad_norm": 1.7964615062313898, + "language_loss": 0.68543047, + "learning_rate": 8.804374287221028e-09, + "loss": 0.70706493, + "num_input_tokens_seen": 174349835, + "step": 8075, + "time_per_iteration": 2.560720920562744 + }, + { + "auxiliary_loss_clip": 0.01114021, + "auxiliary_loss_mlp": 0.01023088, + "balance_loss_clip": 1.03740501, + "balance_loss_mlp": 1.01593816, + "epoch": 0.9710815848012986, + "flos": 23730166281600.0, + "grad_norm": 1.621147413992952, + "language_loss": 0.84582198, + "learning_rate": 8.731513628827958e-09, + "loss": 0.8671931, + "num_input_tokens_seen": 174369200, + "step": 8076, + "time_per_iteration": 3.271825075149536 + }, + { + "auxiliary_loss_clip": 0.01152095, + "auxiliary_loss_mlp": 0.01023799, + "balance_loss_clip": 1.04534686, + "balance_loss_mlp": 1.0165956, + "epoch": 0.9712018276919377, + "flos": 23761875012480.0, + "grad_norm": 3.96040113086316, + "language_loss": 0.82598138, + "learning_rate": 8.658955044280825e-09, + "loss": 0.84774029, + "num_input_tokens_seen": 174388125, + "step": 8077, + "time_per_iteration": 2.47666335105896 + }, + { + "auxiliary_loss_clip": 0.01150589, + "auxiliary_loss_mlp": 0.01021192, + "balance_loss_clip": 1.04697752, + "balance_loss_mlp": 1.01399779, + "epoch": 0.9713220705825768, + "flos": 23330983461120.0, + "grad_norm": 2.1279930362772497, + "language_loss": 0.77493244, + "learning_rate": 8.586698544587268e-09, + "loss": 0.79665029, + "num_input_tokens_seen": 174409735, + "step": 8078, + "time_per_iteration": 2.4621527194976807 + }, + { + "auxiliary_loss_clip": 0.01130648, + "auxiliary_loss_mlp": 0.01025276, + "balance_loss_clip": 1.04271126, + "balance_loss_mlp": 1.01736045, + "epoch": 0.9714423134732159, + "flos": 22200946611840.0, + "grad_norm": 1.942505467808045, + "language_loss": 0.74173582, + "learning_rate": 8.514744140707853e-09, + "loss": 0.76329511, + "num_input_tokens_seen": 174428875, + "step": 8079, + "time_per_iteration": 2.49058198928833 + }, + { + "auxiliary_loss_clip": 0.01161932, + "auxiliary_loss_mlp": 0.01021949, + "balance_loss_clip": 1.04512179, + "balance_loss_mlp": 1.01533628, + "epoch": 0.971562556363855, + "flos": 20229917656320.0, + "grad_norm": 1.6707515383438474, + "language_loss": 0.76542616, + "learning_rate": 8.443091843558515e-09, + "loss": 0.78726494, + "num_input_tokens_seen": 174447960, + "step": 8080, + "time_per_iteration": 2.4033408164978027 + }, + { + "auxiliary_loss_clip": 0.01131959, + "auxiliary_loss_mlp": 0.01020976, + "balance_loss_clip": 1.04291368, + "balance_loss_mlp": 1.01365054, + "epoch": 0.9716827992544941, + "flos": 24970197553920.0, + "grad_norm": 2.096770904796048, + "language_loss": 0.64655209, + "learning_rate": 8.37174166400878e-09, + "loss": 0.66808146, + "num_input_tokens_seen": 174463535, + "step": 8081, + "time_per_iteration": 2.5080575942993164 + }, + { + "auxiliary_loss_clip": 0.01164134, + "auxiliary_loss_mlp": 0.0102404, + "balance_loss_clip": 1.04750323, + "balance_loss_mlp": 1.01714945, + "epoch": 0.9718030421451331, + "flos": 24681476033280.0, + "grad_norm": 2.005648810817171, + "language_loss": 0.84900779, + "learning_rate": 8.300693612881992e-09, + "loss": 0.87088954, + "num_input_tokens_seen": 174483600, + "step": 8082, + "time_per_iteration": 2.4283270835876465 + }, + { + "auxiliary_loss_clip": 0.01148927, + "auxiliary_loss_mlp": 0.0076156, + "balance_loss_clip": 1.0455029, + "balance_loss_mlp": 1.00043702, + "epoch": 0.9719232850357723, + "flos": 22090700793600.0, + "grad_norm": 1.8154833613872918, + "language_loss": 0.81318843, + "learning_rate": 8.22994770095664e-09, + "loss": 0.83229339, + "num_input_tokens_seen": 174502175, + "step": 8083, + "time_per_iteration": 2.4316601753234863 + }, + { + "auxiliary_loss_clip": 0.0113981, + "auxiliary_loss_mlp": 0.01023543, + "balance_loss_clip": 1.04910612, + "balance_loss_mlp": 1.01608944, + "epoch": 0.9720435279264114, + "flos": 23656908493440.0, + "grad_norm": 2.195695438138768, + "language_loss": 0.75365877, + "learning_rate": 8.159503938964585e-09, + "loss": 0.77529228, + "num_input_tokens_seen": 174519495, + "step": 8084, + "time_per_iteration": 2.486135959625244 + }, + { + "auxiliary_loss_clip": 0.01115123, + "auxiliary_loss_mlp": 0.01017586, + "balance_loss_clip": 1.04169464, + "balance_loss_mlp": 1.01093674, + "epoch": 0.9721637708170504, + "flos": 28365910623360.0, + "grad_norm": 1.8123352055315165, + "language_loss": 0.70398134, + "learning_rate": 8.089362337592164e-09, + "loss": 0.72530842, + "num_input_tokens_seen": 174543120, + "step": 8085, + "time_per_iteration": 2.5561342239379883 + }, + { + "auxiliary_loss_clip": 0.0113299, + "auxiliary_loss_mlp": 0.01025978, + "balance_loss_clip": 1.04344547, + "balance_loss_mlp": 1.01916194, + "epoch": 0.9722840137076896, + "flos": 29130807767040.0, + "grad_norm": 1.5550398862774735, + "language_loss": 0.72033215, + "learning_rate": 8.019522907479536e-09, + "loss": 0.74192178, + "num_input_tokens_seen": 174563480, + "step": 8086, + "time_per_iteration": 2.519136428833008 + }, + { + "auxiliary_loss_clip": 0.01154088, + "auxiliary_loss_mlp": 0.01026556, + "balance_loss_clip": 1.04679084, + "balance_loss_mlp": 1.01915598, + "epoch": 0.9724042565983286, + "flos": 19243954258560.0, + "grad_norm": 2.362611294194622, + "language_loss": 0.7724455, + "learning_rate": 7.949985659221558e-09, + "loss": 0.79425192, + "num_input_tokens_seen": 174580745, + "step": 8087, + "time_per_iteration": 2.426196336746216 + }, + { + "auxiliary_loss_clip": 0.01140333, + "auxiliary_loss_mlp": 0.01025646, + "balance_loss_clip": 1.04531229, + "balance_loss_mlp": 1.01915216, + "epoch": 0.9725244994889677, + "flos": 23039676161280.0, + "grad_norm": 2.0301704482002187, + "language_loss": 0.78899974, + "learning_rate": 7.880750603366904e-09, + "loss": 0.81065953, + "num_input_tokens_seen": 174599615, + "step": 8088, + "time_per_iteration": 2.466291666030884 + }, + { + "auxiliary_loss_clip": 0.01130884, + "auxiliary_loss_mlp": 0.01025933, + "balance_loss_clip": 1.04219651, + "balance_loss_mlp": 1.01794887, + "epoch": 0.9726447423796069, + "flos": 23367468700800.0, + "grad_norm": 1.8461772697387901, + "language_loss": 0.79450363, + "learning_rate": 7.811817750418282e-09, + "loss": 0.81607181, + "num_input_tokens_seen": 174618375, + "step": 8089, + "time_per_iteration": 2.5140490531921387 + }, + { + "auxiliary_loss_clip": 0.0112053, + "auxiliary_loss_mlp": 0.0102358, + "balance_loss_clip": 1.04447043, + "balance_loss_mlp": 1.01606369, + "epoch": 0.9727649852702459, + "flos": 26541648639360.0, + "grad_norm": 1.7699347020096867, + "language_loss": 0.80101824, + "learning_rate": 7.743187110833105e-09, + "loss": 0.82245934, + "num_input_tokens_seen": 174641135, + "step": 8090, + "time_per_iteration": 2.5541646480560303 + }, + { + "auxiliary_loss_clip": 0.01138747, + "auxiliary_loss_mlp": 0.01019088, + "balance_loss_clip": 1.04279661, + "balance_loss_mlp": 1.01243615, + "epoch": 0.972885228160885, + "flos": 20522338277760.0, + "grad_norm": 1.4327316647404809, + "language_loss": 0.80602801, + "learning_rate": 7.674858695022602e-09, + "loss": 0.82760644, + "num_input_tokens_seen": 174659490, + "step": 8091, + "time_per_iteration": 2.4772720336914062 + }, + { + "auxiliary_loss_clip": 0.01167419, + "auxiliary_loss_mlp": 0.01025288, + "balance_loss_clip": 1.04871655, + "balance_loss_mlp": 1.0179683, + "epoch": 0.9730054710515241, + "flos": 17566064196480.0, + "grad_norm": 4.171080513387283, + "language_loss": 0.75368786, + "learning_rate": 7.606832513351591e-09, + "loss": 0.77561498, + "num_input_tokens_seen": 174677440, + "step": 8092, + "time_per_iteration": 2.4062042236328125 + }, + { + "auxiliary_loss_clip": 0.01060855, + "auxiliary_loss_mlp": 0.00752819, + "balance_loss_clip": 1.00727057, + "balance_loss_mlp": 0.99990046, + "epoch": 0.9731257139421632, + "flos": 68972010117120.0, + "grad_norm": 0.8387898086433941, + "language_loss": 0.63925081, + "learning_rate": 7.539108576140264e-09, + "loss": 0.65738755, + "num_input_tokens_seen": 174741550, + "step": 8093, + "time_per_iteration": 3.9284298419952393 + }, + { + "auxiliary_loss_clip": 0.01108176, + "auxiliary_loss_mlp": 0.01019434, + "balance_loss_clip": 1.04169524, + "balance_loss_mlp": 1.01318407, + "epoch": 0.9732459568328022, + "flos": 18478841633280.0, + "grad_norm": 7.059130983022169, + "language_loss": 0.70622194, + "learning_rate": 7.471686893661732e-09, + "loss": 0.72749805, + "num_input_tokens_seen": 174759845, + "step": 8094, + "time_per_iteration": 3.3531012535095215 + }, + { + "auxiliary_loss_clip": 0.01133073, + "auxiliary_loss_mlp": 0.01024277, + "balance_loss_clip": 1.04347253, + "balance_loss_mlp": 1.01741314, + "epoch": 0.9733661997234414, + "flos": 20883886623360.0, + "grad_norm": 1.8451418180629455, + "language_loss": 0.64421296, + "learning_rate": 7.4045674761442636e-09, + "loss": 0.6657865, + "num_input_tokens_seen": 174777175, + "step": 8095, + "time_per_iteration": 2.484976291656494 + }, + { + "auxiliary_loss_clip": 0.01163274, + "auxiliary_loss_mlp": 0.00761438, + "balance_loss_clip": 1.0464251, + "balance_loss_mlp": 1.00037682, + "epoch": 0.9734864426140805, + "flos": 23766795175680.0, + "grad_norm": 2.013372456560244, + "language_loss": 0.74053681, + "learning_rate": 7.337750333769488e-09, + "loss": 0.75978392, + "num_input_tokens_seen": 174796980, + "step": 8096, + "time_per_iteration": 3.1700375080108643 + }, + { + "auxiliary_loss_clip": 0.01141496, + "auxiliary_loss_mlp": 0.01023873, + "balance_loss_clip": 1.04051638, + "balance_loss_mlp": 1.01596034, + "epoch": 0.9736066855047195, + "flos": 35042422176000.0, + "grad_norm": 1.8499675575263224, + "language_loss": 0.72811949, + "learning_rate": 7.2712354766737425e-09, + "loss": 0.7497732, + "num_input_tokens_seen": 174817310, + "step": 8097, + "time_per_iteration": 2.590702533721924 + }, + { + "auxiliary_loss_clip": 0.01118651, + "auxiliary_loss_mlp": 0.01025073, + "balance_loss_clip": 1.04600024, + "balance_loss_mlp": 1.01694345, + "epoch": 0.9737269283953586, + "flos": 20410620001920.0, + "grad_norm": 1.5692223122004738, + "language_loss": 0.80446506, + "learning_rate": 7.2050229149469565e-09, + "loss": 0.82590234, + "num_input_tokens_seen": 174837320, + "step": 8098, + "time_per_iteration": 2.4967474937438965 + }, + { + "auxiliary_loss_clip": 0.01125675, + "auxiliary_loss_mlp": 0.01024111, + "balance_loss_clip": 1.04091692, + "balance_loss_mlp": 1.01717913, + "epoch": 0.9738471712859977, + "flos": 28911680847360.0, + "grad_norm": 1.9796723327975578, + "language_loss": 0.6377998, + "learning_rate": 7.139112658633984e-09, + "loss": 0.6592977, + "num_input_tokens_seen": 174857470, + "step": 8099, + "time_per_iteration": 2.5598456859588623 + }, + { + "auxiliary_loss_clip": 0.01122661, + "auxiliary_loss_mlp": 0.01022095, + "balance_loss_clip": 1.04465377, + "balance_loss_mlp": 1.01537728, + "epoch": 0.9739674141766368, + "flos": 27782326356480.0, + "grad_norm": 1.9990668795700717, + "language_loss": 0.70322263, + "learning_rate": 7.073504717733048e-09, + "loss": 0.72467023, + "num_input_tokens_seen": 174877035, + "step": 8100, + "time_per_iteration": 2.5454845428466797 + }, + { + "auxiliary_loss_clip": 0.0101923, + "auxiliary_loss_mlp": 0.01002455, + "balance_loss_clip": 1.00994813, + "balance_loss_mlp": 1.00165641, + "epoch": 0.9740876570672758, + "flos": 68863057188480.0, + "grad_norm": 0.7397545835754016, + "language_loss": 0.57183361, + "learning_rate": 7.008199102196855e-09, + "loss": 0.59205049, + "num_input_tokens_seen": 174938460, + "step": 8101, + "time_per_iteration": 3.064176321029663 + }, + { + "auxiliary_loss_clip": 0.01034016, + "auxiliary_loss_mlp": 0.01002128, + "balance_loss_clip": 1.00816333, + "balance_loss_mlp": 1.00116813, + "epoch": 0.974207899957915, + "flos": 58236622646400.0, + "grad_norm": 0.8218522176226631, + "language_loss": 0.58975828, + "learning_rate": 6.9431958219321464e-09, + "loss": 0.6101197, + "num_input_tokens_seen": 174994625, + "step": 8102, + "time_per_iteration": 3.7765049934387207 + }, + { + "auxiliary_loss_clip": 0.01136935, + "auxiliary_loss_mlp": 0.01024942, + "balance_loss_clip": 1.04401708, + "balance_loss_mlp": 1.01757753, + "epoch": 0.9743281428485541, + "flos": 22600057605120.0, + "grad_norm": 1.492971481254201, + "language_loss": 0.77770996, + "learning_rate": 6.878494886800146e-09, + "loss": 0.79932868, + "num_input_tokens_seen": 175015400, + "step": 8103, + "time_per_iteration": 2.4827070236206055 + }, + { + "auxiliary_loss_clip": 0.01138482, + "auxiliary_loss_mlp": 0.01021934, + "balance_loss_clip": 1.04606843, + "balance_loss_mlp": 1.01478493, + "epoch": 0.9744483857391931, + "flos": 20008815488640.0, + "grad_norm": 2.1230928173914614, + "language_loss": 0.76486599, + "learning_rate": 6.814096306615669e-09, + "loss": 0.78647017, + "num_input_tokens_seen": 175033540, + "step": 8104, + "time_per_iteration": 2.4556937217712402 + }, + { + "auxiliary_loss_clip": 0.01142051, + "auxiliary_loss_mlp": 0.01024623, + "balance_loss_clip": 1.04281402, + "balance_loss_mlp": 1.01721144, + "epoch": 0.9745686286298323, + "flos": 17675268520320.0, + "grad_norm": 2.080499863953652, + "language_loss": 0.65418673, + "learning_rate": 6.750000091148011e-09, + "loss": 0.67585349, + "num_input_tokens_seen": 175050835, + "step": 8105, + "time_per_iteration": 2.442660331726074 + }, + { + "auxiliary_loss_clip": 0.01165537, + "auxiliary_loss_mlp": 0.01024805, + "balance_loss_clip": 1.04791081, + "balance_loss_mlp": 1.01741982, + "epoch": 0.9746888715204713, + "flos": 29460252332160.0, + "grad_norm": 3.059365685756993, + "language_loss": 0.72483301, + "learning_rate": 6.686206250120729e-09, + "loss": 0.74673647, + "num_input_tokens_seen": 175072330, + "step": 8106, + "time_per_iteration": 2.4667716026306152 + }, + { + "auxiliary_loss_clip": 0.01128273, + "auxiliary_loss_mlp": 0.0101988, + "balance_loss_clip": 1.04172897, + "balance_loss_mlp": 1.01302528, + "epoch": 0.9748091144111104, + "flos": 18479308510080.0, + "grad_norm": 2.012738753033083, + "language_loss": 0.74808598, + "learning_rate": 6.622714793210749e-09, + "loss": 0.76956749, + "num_input_tokens_seen": 175091250, + "step": 8107, + "time_per_iteration": 2.486872434616089 + }, + { + "auxiliary_loss_clip": 0.01165791, + "auxiliary_loss_mlp": 0.01021233, + "balance_loss_clip": 1.04698598, + "balance_loss_mlp": 1.01446748, + "epoch": 0.9749293573017496, + "flos": 20665154753280.0, + "grad_norm": 1.6697969042145406, + "language_loss": 0.79010618, + "learning_rate": 6.559525730050364e-09, + "loss": 0.81197643, + "num_input_tokens_seen": 175111350, + "step": 8108, + "time_per_iteration": 2.4031989574432373 + }, + { + "auxiliary_loss_clip": 0.01126333, + "auxiliary_loss_mlp": 0.01021914, + "balance_loss_clip": 1.04463696, + "balance_loss_mlp": 1.01506567, + "epoch": 0.9750496001923886, + "flos": 18478590238080.0, + "grad_norm": 2.100066966205352, + "language_loss": 0.75925088, + "learning_rate": 6.496639070224574e-09, + "loss": 0.78073335, + "num_input_tokens_seen": 175129835, + "step": 8109, + "time_per_iteration": 2.4959845542907715 + }, + { + "auxiliary_loss_clip": 0.01154928, + "auxiliary_loss_mlp": 0.01022647, + "balance_loss_clip": 1.04728138, + "balance_loss_mlp": 1.01635861, + "epoch": 0.9751698430830277, + "flos": 19572967860480.0, + "grad_norm": 2.285181785990845, + "language_loss": 0.84143889, + "learning_rate": 6.4340548232739714e-09, + "loss": 0.86321467, + "num_input_tokens_seen": 175146035, + "step": 8110, + "time_per_iteration": 2.4160165786743164 + }, + { + "auxiliary_loss_clip": 0.01127199, + "auxiliary_loss_mlp": 0.01023324, + "balance_loss_clip": 1.0425216, + "balance_loss_mlp": 1.01586461, + "epoch": 0.9752900859736668, + "flos": 23550325862400.0, + "grad_norm": 2.7302618905670695, + "language_loss": 0.79359519, + "learning_rate": 6.371772998692071e-09, + "loss": 0.81510043, + "num_input_tokens_seen": 175165290, + "step": 8111, + "time_per_iteration": 2.5457630157470703 + }, + { + "auxiliary_loss_clip": 0.01128015, + "auxiliary_loss_mlp": 0.01020181, + "balance_loss_clip": 1.04192674, + "balance_loss_mlp": 1.01324296, + "epoch": 0.9754103288643059, + "flos": 20303211358080.0, + "grad_norm": 6.202481461481704, + "language_loss": 0.64976633, + "learning_rate": 6.309793605927094e-09, + "loss": 0.67124832, + "num_input_tokens_seen": 175183610, + "step": 8112, + "time_per_iteration": 2.5083487033843994 + }, + { + "auxiliary_loss_clip": 0.01141585, + "auxiliary_loss_mlp": 0.01021796, + "balance_loss_clip": 1.04339075, + "balance_loss_mlp": 1.0145781, + "epoch": 0.975530571754945, + "flos": 19350680544000.0, + "grad_norm": 1.6971518476172494, + "language_loss": 0.80095458, + "learning_rate": 6.248116654381297e-09, + "loss": 0.82258844, + "num_input_tokens_seen": 175202080, + "step": 8113, + "time_per_iteration": 2.4608726501464844 + }, + { + "auxiliary_loss_clip": 0.01137997, + "auxiliary_loss_mlp": 0.01022572, + "balance_loss_clip": 1.04163122, + "balance_loss_mlp": 1.01568747, + "epoch": 0.9756508146455841, + "flos": 23583399310080.0, + "grad_norm": 1.669579803985765, + "language_loss": 0.72641063, + "learning_rate": 6.186742153410751e-09, + "loss": 0.74801636, + "num_input_tokens_seen": 175221575, + "step": 8114, + "time_per_iteration": 2.497535228729248 + }, + { + "auxiliary_loss_clip": 0.01136944, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.04501879, + "balance_loss_mlp": 1.02047586, + "epoch": 0.9757710575362232, + "flos": 22966921163520.0, + "grad_norm": 2.371762953044903, + "language_loss": 0.87399918, + "learning_rate": 6.125670112326453e-09, + "loss": 0.89565158, + "num_input_tokens_seen": 175240835, + "step": 8115, + "time_per_iteration": 2.492361068725586 + }, + { + "auxiliary_loss_clip": 0.01150661, + "auxiliary_loss_mlp": 0.01023932, + "balance_loss_clip": 1.04295027, + "balance_loss_mlp": 1.01676512, + "epoch": 0.9758913004268622, + "flos": 27966009530880.0, + "grad_norm": 1.6151264758519213, + "language_loss": 0.70200801, + "learning_rate": 6.064900540392548e-09, + "loss": 0.72375393, + "num_input_tokens_seen": 175262930, + "step": 8116, + "time_per_iteration": 2.5042977333068848 + }, + { + "auxiliary_loss_clip": 0.01131119, + "auxiliary_loss_mlp": 0.0102034, + "balance_loss_clip": 1.0444541, + "balance_loss_mlp": 1.0135746, + "epoch": 0.9760115433175014, + "flos": 22200156512640.0, + "grad_norm": 1.9972200295645908, + "language_loss": 0.78850317, + "learning_rate": 6.0044334468278835e-09, + "loss": 0.81001776, + "num_input_tokens_seen": 175282275, + "step": 8117, + "time_per_iteration": 2.4768154621124268 + }, + { + "auxiliary_loss_clip": 0.01112618, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.04240966, + "balance_loss_mlp": 1.01954341, + "epoch": 0.9761317862081405, + "flos": 26250736389120.0, + "grad_norm": 1.7051864042213905, + "language_loss": 0.71447563, + "learning_rate": 5.944268840805345e-09, + "loss": 0.73587358, + "num_input_tokens_seen": 175303020, + "step": 8118, + "time_per_iteration": 2.5904552936553955 + }, + { + "auxiliary_loss_clip": 0.01114685, + "auxiliary_loss_mlp": 0.01022792, + "balance_loss_clip": 1.03993487, + "balance_loss_mlp": 1.01632166, + "epoch": 0.9762520290987795, + "flos": 26575440359040.0, + "grad_norm": 3.337457256799733, + "language_loss": 0.63808692, + "learning_rate": 5.88440673145163e-09, + "loss": 0.65946174, + "num_input_tokens_seen": 175324070, + "step": 8119, + "time_per_iteration": 3.382863759994507 + }, + { + "auxiliary_loss_clip": 0.01151053, + "auxiliary_loss_mlp": 0.01024807, + "balance_loss_clip": 1.04823482, + "balance_loss_mlp": 1.01774132, + "epoch": 0.9763722719894187, + "flos": 18005036307840.0, + "grad_norm": 2.106207766736534, + "language_loss": 0.82200015, + "learning_rate": 5.824847127848142e-09, + "loss": 0.84375882, + "num_input_tokens_seen": 175342595, + "step": 8120, + "time_per_iteration": 2.4163408279418945 + }, + { + "auxiliary_loss_clip": 0.01111372, + "auxiliary_loss_mlp": 0.01020143, + "balance_loss_clip": 1.04342175, + "balance_loss_mlp": 1.01313317, + "epoch": 0.9764925148800577, + "flos": 22455660931200.0, + "grad_norm": 2.017126212230986, + "language_loss": 0.7852121, + "learning_rate": 5.765590039029433e-09, + "loss": 0.80652726, + "num_input_tokens_seen": 175361915, + "step": 8121, + "time_per_iteration": 3.365196704864502 + }, + { + "auxiliary_loss_clip": 0.01163146, + "auxiliary_loss_mlp": 0.01025189, + "balance_loss_clip": 1.04720449, + "balance_loss_mlp": 1.01806903, + "epoch": 0.9766127577706968, + "flos": 36757084786560.0, + "grad_norm": 1.5228147647626102, + "language_loss": 0.7102589, + "learning_rate": 5.706635473985422e-09, + "loss": 0.73214221, + "num_input_tokens_seen": 175385785, + "step": 8122, + "time_per_iteration": 2.569197416305542 + }, + { + "auxiliary_loss_clip": 0.01147091, + "auxiliary_loss_mlp": 0.01023305, + "balance_loss_clip": 1.04369509, + "balance_loss_mlp": 1.01640034, + "epoch": 0.976733000661336, + "flos": 22309971367680.0, + "grad_norm": 1.778794603266535, + "language_loss": 0.85213721, + "learning_rate": 5.6479834416591764e-09, + "loss": 0.87384117, + "num_input_tokens_seen": 175405145, + "step": 8123, + "time_per_iteration": 3.242032051086426 + }, + { + "auxiliary_loss_clip": 0.01151754, + "auxiliary_loss_mlp": 0.00762177, + "balance_loss_clip": 1.04720449, + "balance_loss_mlp": 1.00044072, + "epoch": 0.976853243551975, + "flos": 25810938264960.0, + "grad_norm": 4.051246936015088, + "language_loss": 0.68507946, + "learning_rate": 5.589633950947803e-09, + "loss": 0.70421875, + "num_input_tokens_seen": 175422645, + "step": 8124, + "time_per_iteration": 2.4661929607391357 + }, + { + "auxiliary_loss_clip": 0.01136933, + "auxiliary_loss_mlp": 0.01027225, + "balance_loss_clip": 1.04482484, + "balance_loss_mlp": 1.01977992, + "epoch": 0.9769734864426141, + "flos": 21397445326080.0, + "grad_norm": 2.0128132042892366, + "language_loss": 0.69590014, + "learning_rate": 5.5315870107035535e-09, + "loss": 0.71754169, + "num_input_tokens_seen": 175440695, + "step": 8125, + "time_per_iteration": 2.4700679779052734 + }, + { + "auxiliary_loss_clip": 0.01136227, + "auxiliary_loss_mlp": 0.01025401, + "balance_loss_clip": 1.04820156, + "balance_loss_mlp": 1.01805735, + "epoch": 0.9770937293332532, + "flos": 13990977584640.0, + "grad_norm": 1.7515072225288109, + "language_loss": 0.78687161, + "learning_rate": 5.473842629731607e-09, + "loss": 0.80848789, + "num_input_tokens_seen": 175459195, + "step": 8126, + "time_per_iteration": 2.467736005783081 + }, + { + "auxiliary_loss_clip": 0.01144096, + "auxiliary_loss_mlp": 0.0076178, + "balance_loss_clip": 1.04396796, + "balance_loss_mlp": 1.00050998, + "epoch": 0.9772139722238923, + "flos": 17931994001280.0, + "grad_norm": 1.9751408344668755, + "language_loss": 0.7812764, + "learning_rate": 5.416400816792066e-09, + "loss": 0.80033517, + "num_input_tokens_seen": 175476710, + "step": 8127, + "time_per_iteration": 2.4525721073150635 + }, + { + "auxiliary_loss_clip": 0.01162821, + "auxiliary_loss_mlp": 0.01020739, + "balance_loss_clip": 1.04615641, + "balance_loss_mlp": 1.01363122, + "epoch": 0.9773342151145313, + "flos": 20446171488000.0, + "grad_norm": 2.442607377215319, + "language_loss": 0.78262478, + "learning_rate": 5.359261580598407e-09, + "loss": 0.80446041, + "num_input_tokens_seen": 175492550, + "step": 8128, + "time_per_iteration": 2.3831400871276855 + }, + { + "auxiliary_loss_clip": 0.01154127, + "auxiliary_loss_mlp": 0.01023394, + "balance_loss_clip": 1.04757047, + "balance_loss_mlp": 1.01586878, + "epoch": 0.9774544580051704, + "flos": 11837306949120.0, + "grad_norm": 2.3653080014836827, + "language_loss": 0.77689081, + "learning_rate": 5.302424929819027e-09, + "loss": 0.798666, + "num_input_tokens_seen": 175506560, + "step": 8129, + "time_per_iteration": 3.1712870597839355 + }, + { + "auxiliary_loss_clip": 0.01154645, + "auxiliary_loss_mlp": 0.01024732, + "balance_loss_clip": 1.04369283, + "balance_loss_mlp": 1.0176394, + "epoch": 0.9775747008958096, + "flos": 13479932833920.0, + "grad_norm": 2.408888524014105, + "language_loss": 0.73017269, + "learning_rate": 5.24589087307592e-09, + "loss": 0.75196648, + "num_input_tokens_seen": 175524180, + "step": 8130, + "time_per_iteration": 2.410435914993286 + }, + { + "auxiliary_loss_clip": 0.01165728, + "auxiliary_loss_mlp": 0.01025072, + "balance_loss_clip": 1.04668736, + "balance_loss_mlp": 1.01816738, + "epoch": 0.9776949437864486, + "flos": 59532314042880.0, + "grad_norm": 2.011909747498914, + "language_loss": 0.64827871, + "learning_rate": 5.189659418944891e-09, + "loss": 0.67018676, + "num_input_tokens_seen": 175554355, + "step": 8131, + "time_per_iteration": 2.8076958656311035 + }, + { + "auxiliary_loss_clip": 0.01165473, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.04889047, + "balance_loss_mlp": 1.02024364, + "epoch": 0.9778151866770877, + "flos": 21178605715200.0, + "grad_norm": 2.1845730907548653, + "language_loss": 0.78398108, + "learning_rate": 5.133730575956674e-09, + "loss": 0.80590552, + "num_input_tokens_seen": 175574025, + "step": 8132, + "time_per_iteration": 2.3939788341522217 + }, + { + "auxiliary_loss_clip": 0.01139669, + "auxiliary_loss_mlp": 0.01019615, + "balance_loss_clip": 1.04460788, + "balance_loss_mlp": 1.01264966, + "epoch": 0.9779354295677268, + "flos": 20886795624960.0, + "grad_norm": 1.8914406519125866, + "language_loss": 0.7174443, + "learning_rate": 5.0781043525953696e-09, + "loss": 0.73903716, + "num_input_tokens_seen": 175592090, + "step": 8133, + "time_per_iteration": 2.4702401161193848 + }, + { + "auxiliary_loss_clip": 0.01131888, + "auxiliary_loss_mlp": 0.01027151, + "balance_loss_clip": 1.04608107, + "balance_loss_mlp": 1.01998973, + "epoch": 0.9780556724583659, + "flos": 23440618748160.0, + "grad_norm": 1.7668006370257898, + "language_loss": 0.73815346, + "learning_rate": 5.0227807572995605e-09, + "loss": 0.75974387, + "num_input_tokens_seen": 175614065, + "step": 8134, + "time_per_iteration": 2.4858009815216064 + }, + { + "auxiliary_loss_clip": 0.01138725, + "auxiliary_loss_mlp": 0.01019099, + "balance_loss_clip": 1.04271317, + "balance_loss_mlp": 1.01222396, + "epoch": 0.9781759153490049, + "flos": 20923244951040.0, + "grad_norm": 2.150014382063035, + "language_loss": 0.66863084, + "learning_rate": 4.967759798461646e-09, + "loss": 0.69020909, + "num_input_tokens_seen": 175632410, + "step": 8135, + "time_per_iteration": 2.4722399711608887 + }, + { + "auxiliary_loss_clip": 0.01163006, + "auxiliary_loss_mlp": 0.01021028, + "balance_loss_clip": 1.0472796, + "balance_loss_mlp": 1.01430786, + "epoch": 0.9782961582396441, + "flos": 28293191539200.0, + "grad_norm": 1.9720769277041839, + "language_loss": 0.74722046, + "learning_rate": 4.913041484428282e-09, + "loss": 0.76906079, + "num_input_tokens_seen": 175652885, + "step": 8136, + "time_per_iteration": 2.4538471698760986 + }, + { + "auxiliary_loss_clip": 0.011533, + "auxiliary_loss_mlp": 0.01021275, + "balance_loss_clip": 1.04742908, + "balance_loss_mlp": 1.01448333, + "epoch": 0.9784164011302832, + "flos": 25552955808000.0, + "grad_norm": 1.8485643289530442, + "language_loss": 0.74049205, + "learning_rate": 4.858625823500384e-09, + "loss": 0.76223785, + "num_input_tokens_seen": 175670585, + "step": 8137, + "time_per_iteration": 2.462751626968384 + }, + { + "auxiliary_loss_clip": 0.01153439, + "auxiliary_loss_mlp": 0.01025347, + "balance_loss_clip": 1.04546785, + "balance_loss_mlp": 1.01767635, + "epoch": 0.9785366440209222, + "flos": 29965945956480.0, + "grad_norm": 1.8551775818326384, + "language_loss": 0.73223931, + "learning_rate": 4.80451282393246e-09, + "loss": 0.75402713, + "num_input_tokens_seen": 175690570, + "step": 8138, + "time_per_iteration": 2.526431083679199 + }, + { + "auxiliary_loss_clip": 0.01135768, + "auxiliary_loss_mlp": 0.01019203, + "balance_loss_clip": 1.04343545, + "balance_loss_mlp": 1.01248288, + "epoch": 0.9786568869115614, + "flos": 32343591847680.0, + "grad_norm": 1.8344639634645163, + "language_loss": 0.6755361, + "learning_rate": 4.750702493933722e-09, + "loss": 0.69708586, + "num_input_tokens_seen": 175710455, + "step": 8139, + "time_per_iteration": 2.5411572456359863 + }, + { + "auxiliary_loss_clip": 0.0113968, + "auxiliary_loss_mlp": 0.0076186, + "balance_loss_clip": 1.04781199, + "balance_loss_mlp": 1.0004015, + "epoch": 0.9787771298022004, + "flos": 23331414424320.0, + "grad_norm": 1.8296538899820265, + "language_loss": 0.85251313, + "learning_rate": 4.697194841666974e-09, + "loss": 0.87152851, + "num_input_tokens_seen": 175729380, + "step": 8140, + "time_per_iteration": 2.482389211654663 + }, + { + "auxiliary_loss_clip": 0.01153549, + "auxiliary_loss_mlp": 0.01026866, + "balance_loss_clip": 1.04599369, + "balance_loss_mlp": 1.01856875, + "epoch": 0.9788973726928395, + "flos": 21468548298240.0, + "grad_norm": 2.3121388636969593, + "language_loss": 0.81939673, + "learning_rate": 4.6439898752492764e-09, + "loss": 0.84120089, + "num_input_tokens_seen": 175749520, + "step": 8141, + "time_per_iteration": 2.4324891567230225 + }, + { + "auxiliary_loss_clip": 0.01052235, + "auxiliary_loss_mlp": 0.00752834, + "balance_loss_clip": 1.00818765, + "balance_loss_mlp": 0.99988747, + "epoch": 0.9790176155834787, + "flos": 68897459439360.0, + "grad_norm": 0.747806976167841, + "language_loss": 0.63677412, + "learning_rate": 4.591087602751731e-09, + "loss": 0.65482479, + "num_input_tokens_seen": 175811380, + "step": 8142, + "time_per_iteration": 3.103113889694214 + }, + { + "auxiliary_loss_clip": 0.01151411, + "auxiliary_loss_mlp": 0.01025138, + "balance_loss_clip": 1.04546499, + "balance_loss_mlp": 1.0182898, + "epoch": 0.9791378584741177, + "flos": 21430877909760.0, + "grad_norm": 1.6713511878043568, + "language_loss": 0.71668696, + "learning_rate": 4.538488032199916e-09, + "loss": 0.73845243, + "num_input_tokens_seen": 175829480, + "step": 8143, + "time_per_iteration": 2.4319615364074707 + }, + { + "auxiliary_loss_clip": 0.01153742, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.04391646, + "balance_loss_mlp": 1.02075255, + "epoch": 0.9792581013647568, + "flos": 20153032594560.0, + "grad_norm": 1.9393341764512604, + "language_loss": 0.68740761, + "learning_rate": 4.486191171572784e-09, + "loss": 0.70922685, + "num_input_tokens_seen": 175846750, + "step": 8144, + "time_per_iteration": 2.4169678688049316 + }, + { + "auxiliary_loss_clip": 0.01153707, + "auxiliary_loss_mlp": 0.01021738, + "balance_loss_clip": 1.04774129, + "balance_loss_mlp": 1.01483262, + "epoch": 0.9793783442553959, + "flos": 23728191033600.0, + "grad_norm": 1.6116398258788225, + "language_loss": 0.77950466, + "learning_rate": 4.434197028803766e-09, + "loss": 0.8012591, + "num_input_tokens_seen": 175865975, + "step": 8145, + "time_per_iteration": 2.491436243057251 + }, + { + "auxiliary_loss_clip": 0.0112927, + "auxiliary_loss_mlp": 0.01026079, + "balance_loss_clip": 1.04270816, + "balance_loss_mlp": 1.01811552, + "epoch": 0.979498587146035, + "flos": 23038742407680.0, + "grad_norm": 2.120129183388703, + "language_loss": 0.81977075, + "learning_rate": 4.3825056117805514e-09, + "loss": 0.84132421, + "num_input_tokens_seen": 175881860, + "step": 8146, + "time_per_iteration": 3.350921630859375 + }, + { + "auxiliary_loss_clip": 0.01164236, + "auxiliary_loss_mlp": 0.01017539, + "balance_loss_clip": 1.04483521, + "balance_loss_mlp": 1.01034176, + "epoch": 0.979618830036674, + "flos": 14318841951360.0, + "grad_norm": 3.371658584094465, + "language_loss": 0.79727459, + "learning_rate": 4.331116928344425e-09, + "loss": 0.81909239, + "num_input_tokens_seen": 175898175, + "step": 8147, + "time_per_iteration": 2.37263560295105 + }, + { + "auxiliary_loss_clip": 0.01140426, + "auxiliary_loss_mlp": 0.00761766, + "balance_loss_clip": 1.04245234, + "balance_loss_mlp": 1.00039959, + "epoch": 0.9797390729273132, + "flos": 16727514215040.0, + "grad_norm": 2.061560084937971, + "language_loss": 0.62689418, + "learning_rate": 4.28003098629115e-09, + "loss": 0.6459161, + "num_input_tokens_seen": 175914310, + "step": 8148, + "time_per_iteration": 3.3335866928100586 + }, + { + "auxiliary_loss_clip": 0.01120241, + "auxiliary_loss_mlp": 0.01018853, + "balance_loss_clip": 1.03794074, + "balance_loss_mlp": 1.01156902, + "epoch": 0.9798593158179523, + "flos": 24532661986560.0, + "grad_norm": 1.8677481259793705, + "language_loss": 0.78492945, + "learning_rate": 4.229247793370305e-09, + "loss": 0.80632043, + "num_input_tokens_seen": 175933435, + "step": 8149, + "time_per_iteration": 2.5447299480438232 + }, + { + "auxiliary_loss_clip": 0.01168353, + "auxiliary_loss_mlp": 0.01023926, + "balance_loss_clip": 1.04859984, + "balance_loss_mlp": 1.01657724, + "epoch": 0.9799795587085913, + "flos": 27308808339840.0, + "grad_norm": 1.5736061605973455, + "language_loss": 0.70442384, + "learning_rate": 4.178767357285951e-09, + "loss": 0.72634661, + "num_input_tokens_seen": 175955065, + "step": 8150, + "time_per_iteration": 3.272921323776245 + }, + { + "auxiliary_loss_clip": 0.01151484, + "auxiliary_loss_mlp": 0.00761873, + "balance_loss_clip": 1.04627144, + "balance_loss_mlp": 1.00048232, + "epoch": 0.9800998015992305, + "flos": 26286575184000.0, + "grad_norm": 1.8535681547593335, + "language_loss": 0.71334445, + "learning_rate": 4.128589685695516e-09, + "loss": 0.73247802, + "num_input_tokens_seen": 175975490, + "step": 8151, + "time_per_iteration": 2.495699167251587 + }, + { + "auxiliary_loss_clip": 0.01164658, + "auxiliary_loss_mlp": 0.01023643, + "balance_loss_clip": 1.04765797, + "balance_loss_mlp": 1.01717293, + "epoch": 0.9802200444898695, + "flos": 16723635546240.0, + "grad_norm": 1.9045521649027974, + "language_loss": 0.84778726, + "learning_rate": 4.078714786211135e-09, + "loss": 0.86967027, + "num_input_tokens_seen": 175991340, + "step": 8152, + "time_per_iteration": 2.3678603172302246 + }, + { + "auxiliary_loss_clip": 0.01147414, + "auxiliary_loss_mlp": 0.01019576, + "balance_loss_clip": 1.04514623, + "balance_loss_mlp": 1.01288581, + "epoch": 0.9803402873805086, + "flos": 24900459298560.0, + "grad_norm": 1.6416467066632476, + "language_loss": 0.76966894, + "learning_rate": 4.029142666398977e-09, + "loss": 0.79133886, + "num_input_tokens_seen": 176011505, + "step": 8153, + "time_per_iteration": 2.4981791973114014 + }, + { + "auxiliary_loss_clip": 0.01163146, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.04716539, + "balance_loss_mlp": 1.01903391, + "epoch": 0.9804605302711478, + "flos": 22564937082240.0, + "grad_norm": 1.767689089059547, + "language_loss": 0.80228394, + "learning_rate": 3.979873333778805e-09, + "loss": 0.8241725, + "num_input_tokens_seen": 176029680, + "step": 8154, + "time_per_iteration": 2.404353141784668 + }, + { + "auxiliary_loss_clip": 0.01141301, + "auxiliary_loss_mlp": 0.01025651, + "balance_loss_clip": 1.0459156, + "balance_loss_mlp": 1.01857567, + "epoch": 0.9805807731617868, + "flos": 38905368382080.0, + "grad_norm": 1.803885339464823, + "language_loss": 0.73662245, + "learning_rate": 3.930906795824862e-09, + "loss": 0.75829196, + "num_input_tokens_seen": 176050355, + "step": 8155, + "time_per_iteration": 2.6353776454925537 + }, + { + "auxiliary_loss_clip": 0.0114837, + "auxiliary_loss_mlp": 0.01019828, + "balance_loss_clip": 1.04474437, + "balance_loss_mlp": 1.0128839, + "epoch": 0.9807010160524259, + "flos": 17821999578240.0, + "grad_norm": 2.0487655471343174, + "language_loss": 0.76738417, + "learning_rate": 3.882243059965207e-09, + "loss": 0.78906608, + "num_input_tokens_seen": 176068070, + "step": 8156, + "time_per_iteration": 3.191441297531128 + }, + { + "auxiliary_loss_clip": 0.0114346, + "auxiliary_loss_mlp": 0.01021056, + "balance_loss_clip": 1.04243064, + "balance_loss_mlp": 1.01410329, + "epoch": 0.980821258943065, + "flos": 13552975140480.0, + "grad_norm": 2.508938664063899, + "language_loss": 0.66036391, + "learning_rate": 3.833882133582156e-09, + "loss": 0.6820091, + "num_input_tokens_seen": 176083730, + "step": 8157, + "time_per_iteration": 2.411853790283203 + }, + { + "auxiliary_loss_clip": 0.01152159, + "auxiliary_loss_mlp": 0.01021613, + "balance_loss_clip": 1.04498196, + "balance_loss_mlp": 1.01479387, + "epoch": 0.9809415018337041, + "flos": 21689794120320.0, + "grad_norm": 1.8332722094206069, + "language_loss": 0.78118157, + "learning_rate": 3.785824024012285e-09, + "loss": 0.80291933, + "num_input_tokens_seen": 176102730, + "step": 8158, + "time_per_iteration": 2.4413275718688965 + }, + { + "auxiliary_loss_clip": 0.01129618, + "auxiliary_loss_mlp": 0.01024136, + "balance_loss_clip": 1.0465486, + "balance_loss_mlp": 1.01722431, + "epoch": 0.9810617447243432, + "flos": 23294857357440.0, + "grad_norm": 1.8396009466613221, + "language_loss": 0.78398681, + "learning_rate": 3.738068738545541e-09, + "loss": 0.80552435, + "num_input_tokens_seen": 176121815, + "step": 8159, + "time_per_iteration": 2.4800519943237305 + }, + { + "auxiliary_loss_clip": 0.01155248, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.0465827, + "balance_loss_mlp": 1.02221584, + "epoch": 0.9811819876149822, + "flos": 18332038748160.0, + "grad_norm": 3.4786211851779782, + "language_loss": 0.7845642, + "learning_rate": 3.6906162844265733e-09, + "loss": 0.80640912, + "num_input_tokens_seen": 176138900, + "step": 8160, + "time_per_iteration": 2.401216506958008 + }, + { + "auxiliary_loss_clip": 0.01130173, + "auxiliary_loss_mlp": 0.01026881, + "balance_loss_clip": 1.04174781, + "balance_loss_mlp": 1.01934052, + "epoch": 0.9813022305056214, + "flos": 22601961025920.0, + "grad_norm": 1.9317683172812767, + "language_loss": 0.70888704, + "learning_rate": 3.643466668853845e-09, + "loss": 0.73045754, + "num_input_tokens_seen": 176156925, + "step": 8161, + "time_per_iteration": 2.465182065963745 + }, + { + "auxiliary_loss_clip": 0.01138091, + "auxiliary_loss_mlp": 0.01019916, + "balance_loss_clip": 1.04395604, + "balance_loss_mlp": 1.01228404, + "epoch": 0.9814224733962604, + "flos": 25413335642880.0, + "grad_norm": 4.715763493552389, + "language_loss": 0.75653255, + "learning_rate": 3.59661989898008e-09, + "loss": 0.77811265, + "num_input_tokens_seen": 176177980, + "step": 8162, + "time_per_iteration": 2.513110637664795 + }, + { + "auxiliary_loss_clip": 0.01117367, + "auxiliary_loss_mlp": 0.01024265, + "balance_loss_clip": 1.04372442, + "balance_loss_mlp": 1.01737475, + "epoch": 0.9815427162868995, + "flos": 25007185584000.0, + "grad_norm": 1.6625366758979532, + "language_loss": 0.76566422, + "learning_rate": 3.5500759819115934e-09, + "loss": 0.78708053, + "num_input_tokens_seen": 176198345, + "step": 8163, + "time_per_iteration": 2.5160861015319824 + }, + { + "auxiliary_loss_clip": 0.01166516, + "auxiliary_loss_mlp": 0.01021826, + "balance_loss_clip": 1.04857266, + "balance_loss_mlp": 1.01467061, + "epoch": 0.9816629591775387, + "flos": 20662604887680.0, + "grad_norm": 2.4644786885744705, + "language_loss": 0.80745482, + "learning_rate": 3.5038349247094034e-09, + "loss": 0.82933831, + "num_input_tokens_seen": 176215605, + "step": 8164, + "time_per_iteration": 2.3838396072387695 + }, + { + "auxiliary_loss_clip": 0.01134614, + "auxiliary_loss_mlp": 0.0102223, + "balance_loss_clip": 1.0405457, + "balance_loss_mlp": 1.01488686, + "epoch": 0.9817832020681777, + "flos": 17712220636800.0, + "grad_norm": 2.0773076081398094, + "language_loss": 0.77310395, + "learning_rate": 3.4578967343878994e-09, + "loss": 0.79467249, + "num_input_tokens_seen": 176231810, + "step": 8165, + "time_per_iteration": 2.4234228134155273 + }, + { + "auxiliary_loss_clip": 0.01134869, + "auxiliary_loss_mlp": 0.01023098, + "balance_loss_clip": 1.04334927, + "balance_loss_mlp": 1.01638103, + "epoch": 0.9819034449588168, + "flos": 22530032040960.0, + "grad_norm": 1.8230582597765592, + "language_loss": 0.80955255, + "learning_rate": 3.4122614179161733e-09, + "loss": 0.83113223, + "num_input_tokens_seen": 176251770, + "step": 8166, + "time_per_iteration": 2.4711897373199463 + }, + { + "auxiliary_loss_clip": 0.01113284, + "auxiliary_loss_mlp": 0.01021099, + "balance_loss_clip": 1.04131031, + "balance_loss_mlp": 1.01410198, + "epoch": 0.9820236878494559, + "flos": 20011221699840.0, + "grad_norm": 1.7171763175084334, + "language_loss": 0.78283048, + "learning_rate": 3.36692898221691e-09, + "loss": 0.8041743, + "num_input_tokens_seen": 176270135, + "step": 8167, + "time_per_iteration": 2.4911482334136963 + }, + { + "auxiliary_loss_clip": 0.01151389, + "auxiliary_loss_mlp": 0.01023163, + "balance_loss_clip": 1.0445013, + "balance_loss_mlp": 1.01653206, + "epoch": 0.982143930740095, + "flos": 18807316531200.0, + "grad_norm": 1.92690498703883, + "language_loss": 0.73083019, + "learning_rate": 3.3218994341668305e-09, + "loss": 0.75257564, + "num_input_tokens_seen": 176289065, + "step": 8168, + "time_per_iteration": 2.413191318511963 + }, + { + "auxiliary_loss_clip": 0.01164056, + "auxiliary_loss_mlp": 0.01021881, + "balance_loss_clip": 1.04855669, + "balance_loss_mlp": 1.01537204, + "epoch": 0.982264173630734, + "flos": 26578026138240.0, + "grad_norm": 1.5357757619413042, + "language_loss": 0.75578237, + "learning_rate": 3.2771727805971373e-09, + "loss": 0.77764171, + "num_input_tokens_seen": 176310450, + "step": 8169, + "time_per_iteration": 2.4397590160369873 + }, + { + "auxiliary_loss_clip": 0.01102818, + "auxiliary_loss_mlp": 0.01020651, + "balance_loss_clip": 1.03774548, + "balance_loss_mlp": 1.0137248, + "epoch": 0.9823844165213732, + "flos": 22014462176640.0, + "grad_norm": 1.8060366368054204, + "language_loss": 0.77218843, + "learning_rate": 3.232749028292847e-09, + "loss": 0.79342312, + "num_input_tokens_seen": 176327415, + "step": 8170, + "time_per_iteration": 2.5204882621765137 + }, + { + "auxiliary_loss_clip": 0.01165422, + "auxiliary_loss_mlp": 0.01025064, + "balance_loss_clip": 1.04566145, + "balance_loss_mlp": 1.01802504, + "epoch": 0.9825046594120123, + "flos": 21908166854400.0, + "grad_norm": 1.9804211903255802, + "language_loss": 0.88267243, + "learning_rate": 3.188628183992792e-09, + "loss": 0.90457731, + "num_input_tokens_seen": 176347680, + "step": 8171, + "time_per_iteration": 2.4180612564086914 + }, + { + "auxiliary_loss_clip": 0.01051799, + "auxiliary_loss_mlp": 0.01000784, + "balance_loss_clip": 1.00713468, + "balance_loss_mlp": 0.99993753, + "epoch": 0.9826249023026513, + "flos": 59494610718720.0, + "grad_norm": 0.7387942025385362, + "language_loss": 0.62611729, + "learning_rate": 3.1448102543902844e-09, + "loss": 0.64664316, + "num_input_tokens_seen": 176411595, + "step": 8172, + "time_per_iteration": 3.0106372833251953 + }, + { + "auxiliary_loss_clip": 0.01129078, + "auxiliary_loss_mlp": 0.01027565, + "balance_loss_clip": 1.0434792, + "balance_loss_mlp": 1.02008486, + "epoch": 0.9827451451932905, + "flos": 16071031296000.0, + "grad_norm": 1.821035821082583, + "language_loss": 0.67597413, + "learning_rate": 3.1012952461324515e-09, + "loss": 0.69754052, + "num_input_tokens_seen": 176430570, + "step": 8173, + "time_per_iteration": 3.326709270477295 + }, + { + "auxiliary_loss_clip": 0.01148025, + "auxiliary_loss_mlp": 0.01026059, + "balance_loss_clip": 1.04706907, + "balance_loss_mlp": 1.01877487, + "epoch": 0.9828653880839295, + "flos": 20262775622400.0, + "grad_norm": 3.6820413674070247, + "language_loss": 0.73906165, + "learning_rate": 3.0580831658204575e-09, + "loss": 0.76080251, + "num_input_tokens_seen": 176448150, + "step": 8174, + "time_per_iteration": 2.43330454826355 + }, + { + "auxiliary_loss_clip": 0.01149705, + "auxiliary_loss_mlp": 0.01021411, + "balance_loss_clip": 1.047019, + "balance_loss_mlp": 1.0150249, + "epoch": 0.9829856309745686, + "flos": 21616141282560.0, + "grad_norm": 1.5922564052528934, + "language_loss": 0.77869344, + "learning_rate": 3.015174020009281e-09, + "loss": 0.80040467, + "num_input_tokens_seen": 176467475, + "step": 8175, + "time_per_iteration": 3.263957977294922 + }, + { + "auxiliary_loss_clip": 0.01126519, + "auxiliary_loss_mlp": 0.01023605, + "balance_loss_clip": 1.04298818, + "balance_loss_mlp": 1.01717329, + "epoch": 0.9831058738652078, + "flos": 23764209396480.0, + "grad_norm": 2.0496633053446307, + "language_loss": 0.74972546, + "learning_rate": 2.9725678152086043e-09, + "loss": 0.77122664, + "num_input_tokens_seen": 176486045, + "step": 8176, + "time_per_iteration": 3.312959909439087 + }, + { + "auxiliary_loss_clip": 0.01123988, + "auxiliary_loss_mlp": 0.01022397, + "balance_loss_clip": 1.0419023, + "balance_loss_mlp": 1.0152328, + "epoch": 0.9832261167558468, + "flos": 11320911072000.0, + "grad_norm": 2.5278741537232836, + "language_loss": 0.82737422, + "learning_rate": 2.930264557881257e-09, + "loss": 0.84883809, + "num_input_tokens_seen": 176501230, + "step": 8177, + "time_per_iteration": 2.418728828430176 + }, + { + "auxiliary_loss_clip": 0.0106086, + "auxiliary_loss_mlp": 0.01000673, + "balance_loss_clip": 1.0073384, + "balance_loss_mlp": 0.99982661, + "epoch": 0.9833463596464859, + "flos": 60000304343040.0, + "grad_norm": 0.8349583636822246, + "language_loss": 0.58210409, + "learning_rate": 2.8882642544452163e-09, + "loss": 0.60271943, + "num_input_tokens_seen": 176565955, + "step": 8178, + "time_per_iteration": 3.0307512283325195 + }, + { + "auxiliary_loss_clip": 0.01125384, + "auxiliary_loss_mlp": 0.01027846, + "balance_loss_clip": 1.04159105, + "balance_loss_mlp": 1.0206666, + "epoch": 0.983466602537125, + "flos": 13626699805440.0, + "grad_norm": 2.2724603447668277, + "language_loss": 0.74419934, + "learning_rate": 2.8465669112716083e-09, + "loss": 0.76573163, + "num_input_tokens_seen": 176583480, + "step": 8179, + "time_per_iteration": 2.4320054054260254 + }, + { + "auxiliary_loss_clip": 0.01151471, + "auxiliary_loss_mlp": 0.0076166, + "balance_loss_clip": 1.04367423, + "balance_loss_mlp": 1.00044596, + "epoch": 0.9835868454277641, + "flos": 22926844563840.0, + "grad_norm": 1.938651430601561, + "language_loss": 0.76275772, + "learning_rate": 2.8051725346858177e-09, + "loss": 0.78188902, + "num_input_tokens_seen": 176603740, + "step": 8180, + "time_per_iteration": 2.453141927719116 + }, + { + "auxiliary_loss_clip": 0.01167265, + "auxiliary_loss_mlp": 0.01025723, + "balance_loss_clip": 1.04661274, + "balance_loss_mlp": 1.01874924, + "epoch": 0.9837070883184031, + "flos": 27673409341440.0, + "grad_norm": 2.052482446452464, + "language_loss": 0.7064414, + "learning_rate": 2.7640811309674883e-09, + "loss": 0.72837126, + "num_input_tokens_seen": 176623240, + "step": 8181, + "time_per_iteration": 2.4584624767303467 + }, + { + "auxiliary_loss_clip": 0.01112655, + "auxiliary_loss_mlp": 0.01020801, + "balance_loss_clip": 1.04154074, + "balance_loss_mlp": 1.01393723, + "epoch": 0.9838273312090423, + "flos": 29241951425280.0, + "grad_norm": 1.6232823629135043, + "language_loss": 0.80797291, + "learning_rate": 2.7232927063498557e-09, + "loss": 0.82930744, + "num_input_tokens_seen": 176643615, + "step": 8182, + "time_per_iteration": 3.3102962970733643 + }, + { + "auxiliary_loss_clip": 0.01153901, + "auxiliary_loss_mlp": 0.01021006, + "balance_loss_clip": 1.04618919, + "balance_loss_mlp": 1.01408613, + "epoch": 0.9839475740996814, + "flos": 40110207304320.0, + "grad_norm": 6.551770927331249, + "language_loss": 0.68979466, + "learning_rate": 2.682807267020859e-09, + "loss": 0.71154368, + "num_input_tokens_seen": 176666375, + "step": 8183, + "time_per_iteration": 2.5814456939697266 + }, + { + "auxiliary_loss_clip": 0.01150134, + "auxiliary_loss_mlp": 0.01022598, + "balance_loss_clip": 1.04517221, + "balance_loss_mlp": 1.01510835, + "epoch": 0.9840678169903204, + "flos": 24169389788160.0, + "grad_norm": 1.522742496874214, + "language_loss": 0.62137973, + "learning_rate": 2.642624819121808e-09, + "loss": 0.643107, + "num_input_tokens_seen": 176686525, + "step": 8184, + "time_per_iteration": 2.433030605316162 + }, + { + "auxiliary_loss_clip": 0.01134684, + "auxiliary_loss_mlp": 0.01025279, + "balance_loss_clip": 1.04492819, + "balance_loss_mlp": 1.0182457, + "epoch": 0.9841880598809596, + "flos": 14684484447360.0, + "grad_norm": 1.9074219501131964, + "language_loss": 0.61766517, + "learning_rate": 2.6027453687487154e-09, + "loss": 0.63926476, + "num_input_tokens_seen": 176703615, + "step": 8185, + "time_per_iteration": 2.453526258468628 + }, + { + "auxiliary_loss_clip": 0.0113783, + "auxiliary_loss_mlp": 0.01022139, + "balance_loss_clip": 1.04499722, + "balance_loss_mlp": 1.01523411, + "epoch": 0.9843083027715986, + "flos": 22344768668160.0, + "grad_norm": 2.3130294478795603, + "language_loss": 0.54243672, + "learning_rate": 2.5631689219509643e-09, + "loss": 0.56403637, + "num_input_tokens_seen": 176722295, + "step": 8186, + "time_per_iteration": 2.466492176055908 + }, + { + "auxiliary_loss_clip": 0.01139525, + "auxiliary_loss_mlp": 0.01021497, + "balance_loss_clip": 1.04729521, + "balance_loss_mlp": 1.0150423, + "epoch": 0.9844285456622377, + "flos": 21800111765760.0, + "grad_norm": 1.7542054970354624, + "language_loss": 0.83476424, + "learning_rate": 2.523895484732197e-09, + "loss": 0.8563745, + "num_input_tokens_seen": 176741750, + "step": 8187, + "time_per_iteration": 2.4721720218658447 + }, + { + "auxiliary_loss_clip": 0.01155893, + "auxiliary_loss_mlp": 0.01024094, + "balance_loss_clip": 1.04543436, + "balance_loss_mlp": 1.01610994, + "epoch": 0.9845487885528769, + "flos": 18035380321920.0, + "grad_norm": 1.9993578250037793, + "language_loss": 0.74915093, + "learning_rate": 2.4849250630505357e-09, + "loss": 0.77095079, + "num_input_tokens_seen": 176759995, + "step": 8188, + "time_per_iteration": 2.4326727390289307 + }, + { + "auxiliary_loss_clip": 0.0106834, + "auxiliary_loss_mlp": 0.01025149, + "balance_loss_clip": 1.03526163, + "balance_loss_mlp": 1.01820219, + "epoch": 0.9846690314435159, + "flos": 25228610974080.0, + "grad_norm": 1.6861320855085145, + "language_loss": 0.73389876, + "learning_rate": 2.4462576628172528e-09, + "loss": 0.75483364, + "num_input_tokens_seen": 176778625, + "step": 8189, + "time_per_iteration": 2.6228549480438232 + }, + { + "auxiliary_loss_clip": 0.01149123, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.04652023, + "balance_loss_mlp": 1.02082777, + "epoch": 0.984789274334155, + "flos": 18552171248640.0, + "grad_norm": 2.8843873994560543, + "language_loss": 0.74140555, + "learning_rate": 2.407893289898766e-09, + "loss": 0.76318121, + "num_input_tokens_seen": 176797655, + "step": 8190, + "time_per_iteration": 2.4206769466400146 + }, + { + "auxiliary_loss_clip": 0.01113867, + "auxiliary_loss_mlp": 0.01018266, + "balance_loss_clip": 1.03964412, + "balance_loss_mlp": 1.01115489, + "epoch": 0.984909517224794, + "flos": 27345437233920.0, + "grad_norm": 1.7752209905871499, + "language_loss": 0.84196013, + "learning_rate": 2.3698319501144202e-09, + "loss": 0.86328149, + "num_input_tokens_seen": 176818640, + "step": 8191, + "time_per_iteration": 2.5583198070526123 + }, + { + "auxiliary_loss_clip": 0.01156865, + "auxiliary_loss_mlp": 0.01025094, + "balance_loss_clip": 1.0452652, + "balance_loss_mlp": 1.01747036, + "epoch": 0.9850297601154332, + "flos": 18734058743040.0, + "grad_norm": 1.5630931861897777, + "language_loss": 0.73526227, + "learning_rate": 2.3320736492382644e-09, + "loss": 0.75708187, + "num_input_tokens_seen": 176837475, + "step": 8192, + "time_per_iteration": 2.4372599124908447 + }, + { + "auxiliary_loss_clip": 0.01162336, + "auxiliary_loss_mlp": 0.01026282, + "balance_loss_clip": 1.04683065, + "balance_loss_mlp": 1.01958275, + "epoch": 0.9851500030060723, + "flos": 22308247514880.0, + "grad_norm": 1.5268476826951913, + "language_loss": 0.68155867, + "learning_rate": 2.29461839299816e-09, + "loss": 0.70344484, + "num_input_tokens_seen": 176857190, + "step": 8193, + "time_per_iteration": 2.4199471473693848 + }, + { + "auxiliary_loss_clip": 0.01124452, + "auxiliary_loss_mlp": 0.01017563, + "balance_loss_clip": 1.0418613, + "balance_loss_mlp": 1.01095009, + "epoch": 0.9852702458967113, + "flos": 26353691746560.0, + "grad_norm": 1.5149358253887673, + "language_loss": 0.80068064, + "learning_rate": 2.257466187076229e-09, + "loss": 0.82210076, + "num_input_tokens_seen": 176876395, + "step": 8194, + "time_per_iteration": 2.5408670902252197 + }, + { + "auxiliary_loss_clip": 0.01154064, + "auxiliary_loss_mlp": 0.00761708, + "balance_loss_clip": 1.04399788, + "balance_loss_mlp": 1.00036049, + "epoch": 0.9853904887873505, + "flos": 20883599314560.0, + "grad_norm": 2.049029767752672, + "language_loss": 0.71050161, + "learning_rate": 2.2206170371081854e-09, + "loss": 0.72965932, + "num_input_tokens_seen": 176894980, + "step": 8195, + "time_per_iteration": 2.443204402923584 + }, + { + "auxiliary_loss_clip": 0.01138112, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.04334724, + "balance_loss_mlp": 1.01947975, + "epoch": 0.9855107316779895, + "flos": 25263444188160.0, + "grad_norm": 1.6394022493108442, + "language_loss": 0.85019565, + "learning_rate": 2.1840709486842247e-09, + "loss": 0.87185192, + "num_input_tokens_seen": 176914600, + "step": 8196, + "time_per_iteration": 2.506575107574463 + }, + { + "auxiliary_loss_clip": 0.01127804, + "auxiliary_loss_mlp": 0.01027435, + "balance_loss_clip": 1.04220319, + "balance_loss_mlp": 1.02022243, + "epoch": 0.9856309745686286, + "flos": 19062102677760.0, + "grad_norm": 1.9332385512072892, + "language_loss": 0.79136419, + "learning_rate": 2.1478279273481335e-09, + "loss": 0.81291664, + "num_input_tokens_seen": 176933085, + "step": 8197, + "time_per_iteration": 2.462186098098755 + }, + { + "auxiliary_loss_clip": 0.01150696, + "auxiliary_loss_mlp": 0.01025565, + "balance_loss_clip": 1.04712677, + "balance_loss_mlp": 1.01867497, + "epoch": 0.9857512174592677, + "flos": 34130758060800.0, + "grad_norm": 2.1952292721048243, + "language_loss": 0.79617262, + "learning_rate": 2.1118879785981815e-09, + "loss": 0.81793523, + "num_input_tokens_seen": 176953225, + "step": 8198, + "time_per_iteration": 2.538031816482544 + }, + { + "auxiliary_loss_clip": 0.01133297, + "auxiliary_loss_mlp": 0.01022241, + "balance_loss_clip": 1.04383445, + "balance_loss_mlp": 1.01555085, + "epoch": 0.9858714603499068, + "flos": 25994693266560.0, + "grad_norm": 1.5733570963846992, + "language_loss": 0.79581743, + "learning_rate": 2.0762511078862288e-09, + "loss": 0.8173728, + "num_input_tokens_seen": 176973570, + "step": 8199, + "time_per_iteration": 3.349174737930298 + }, + { + "auxiliary_loss_clip": 0.01142479, + "auxiliary_loss_mlp": 0.01020954, + "balance_loss_clip": 1.0434587, + "balance_loss_mlp": 1.01426649, + "epoch": 0.9859917032405459, + "flos": 23696230907520.0, + "grad_norm": 2.171100552167529, + "language_loss": 0.65155751, + "learning_rate": 2.0409173206186183e-09, + "loss": 0.67319179, + "num_input_tokens_seen": 176992810, + "step": 8200, + "time_per_iteration": 2.4816854000091553 + }, + { + "auxiliary_loss_clip": 0.01121187, + "auxiliary_loss_mlp": 0.01023869, + "balance_loss_clip": 1.04552507, + "balance_loss_mlp": 1.01708949, + "epoch": 0.986111946131185, + "flos": 19938287134080.0, + "grad_norm": 4.955463796134859, + "language_loss": 0.87317407, + "learning_rate": 2.0058866221550617e-09, + "loss": 0.89462465, + "num_input_tokens_seen": 177011050, + "step": 8201, + "time_per_iteration": 3.305004596710205 + }, + { + "auxiliary_loss_clip": 0.01163263, + "auxiliary_loss_mlp": 0.0102051, + "balance_loss_clip": 1.04446673, + "balance_loss_mlp": 1.01359296, + "epoch": 0.9862321890218241, + "flos": 19828831415040.0, + "grad_norm": 2.128078414798084, + "language_loss": 0.75860846, + "learning_rate": 1.971159017809976e-09, + "loss": 0.78044623, + "num_input_tokens_seen": 177029340, + "step": 8202, + "time_per_iteration": 2.389552354812622 + }, + { + "auxiliary_loss_clip": 0.01149284, + "auxiliary_loss_mlp": 0.01024006, + "balance_loss_clip": 1.04506397, + "balance_loss_mlp": 1.01650786, + "epoch": 0.9863524319124631, + "flos": 21652051904640.0, + "grad_norm": 23.582536999610802, + "language_loss": 0.77674824, + "learning_rate": 1.93673451285159e-09, + "loss": 0.79848117, + "num_input_tokens_seen": 177048390, + "step": 8203, + "time_per_iteration": 3.1873929500579834 + }, + { + "auxiliary_loss_clip": 0.0104363, + "auxiliary_loss_mlp": 0.0100119, + "balance_loss_clip": 1.00771856, + "balance_loss_mlp": 1.00029624, + "epoch": 0.9864726748031023, + "flos": 52769977920000.0, + "grad_norm": 0.7331525055409038, + "language_loss": 0.56509274, + "learning_rate": 1.9026131125019495e-09, + "loss": 0.58554095, + "num_input_tokens_seen": 177105760, + "step": 8204, + "time_per_iteration": 2.98915958404541 + }, + { + "auxiliary_loss_clip": 0.01146008, + "auxiliary_loss_mlp": 0.01023233, + "balance_loss_clip": 1.04641187, + "balance_loss_mlp": 1.01645613, + "epoch": 0.9865929176937414, + "flos": 23364631526400.0, + "grad_norm": 1.6207546569319238, + "language_loss": 0.86899006, + "learning_rate": 1.8687948219371363e-09, + "loss": 0.89068246, + "num_input_tokens_seen": 177124985, + "step": 8205, + "time_per_iteration": 2.4355862140655518 + }, + { + "auxiliary_loss_clip": 0.01169421, + "auxiliary_loss_mlp": 0.01025656, + "balance_loss_clip": 1.04695034, + "balance_loss_mlp": 1.01823783, + "epoch": 0.9867131605843804, + "flos": 21616679986560.0, + "grad_norm": 1.9949169528993096, + "language_loss": 0.88505238, + "learning_rate": 1.835279646287491e-09, + "loss": 0.90700316, + "num_input_tokens_seen": 177142995, + "step": 8206, + "time_per_iteration": 2.3916306495666504 + }, + { + "auxiliary_loss_clip": 0.01157556, + "auxiliary_loss_mlp": 0.01029037, + "balance_loss_clip": 1.04673064, + "balance_loss_mlp": 1.02163994, + "epoch": 0.9868334034750196, + "flos": 22271403139200.0, + "grad_norm": 1.8474755622821888, + "language_loss": 0.76665807, + "learning_rate": 1.8020675906371685e-09, + "loss": 0.78852397, + "num_input_tokens_seen": 177162390, + "step": 8207, + "time_per_iteration": 2.433635950088501 + }, + { + "auxiliary_loss_clip": 0.01103672, + "auxiliary_loss_mlp": 0.01022467, + "balance_loss_clip": 1.03899336, + "balance_loss_mlp": 1.01545727, + "epoch": 0.9869536463656586, + "flos": 25809573548160.0, + "grad_norm": 2.0798946815339803, + "language_loss": 0.75366139, + "learning_rate": 1.7691586600243612e-09, + "loss": 0.77492273, + "num_input_tokens_seen": 177181290, + "step": 8208, + "time_per_iteration": 2.557999610900879 + }, + { + "auxiliary_loss_clip": 0.01137628, + "auxiliary_loss_mlp": 0.01024275, + "balance_loss_clip": 1.04889417, + "balance_loss_mlp": 1.01703644, + "epoch": 0.9870738892562977, + "flos": 16398500613120.0, + "grad_norm": 5.2309106437645605, + "language_loss": 0.86625189, + "learning_rate": 1.7365528594415202e-09, + "loss": 0.88787091, + "num_input_tokens_seen": 177195360, + "step": 8209, + "time_per_iteration": 3.213785171508789 + }, + { + "auxiliary_loss_clip": 0.01153724, + "auxiliary_loss_mlp": 0.00762035, + "balance_loss_clip": 1.04468632, + "balance_loss_mlp": 1.00045824, + "epoch": 0.9871941321469369, + "flos": 35481358373760.0, + "grad_norm": 9.45133272130327, + "language_loss": 0.67614359, + "learning_rate": 1.7042501938346888e-09, + "loss": 0.69530118, + "num_input_tokens_seen": 177218090, + "step": 8210, + "time_per_iteration": 2.5642542839050293 + }, + { + "auxiliary_loss_clip": 0.01124516, + "auxiliary_loss_mlp": 0.01021806, + "balance_loss_clip": 1.03919399, + "balance_loss_mlp": 1.01470113, + "epoch": 0.9873143750375759, + "flos": 21434217874560.0, + "grad_norm": 1.975584357556349, + "language_loss": 0.76455963, + "learning_rate": 1.6722506681043913e-09, + "loss": 0.7860229, + "num_input_tokens_seen": 177237050, + "step": 8211, + "time_per_iteration": 2.476716995239258 + }, + { + "auxiliary_loss_clip": 0.01140849, + "auxiliary_loss_mlp": 0.01027585, + "balance_loss_clip": 1.04503632, + "balance_loss_mlp": 1.0207485, + "epoch": 0.987434617928215, + "flos": 16326499800960.0, + "grad_norm": 2.979844503968214, + "language_loss": 0.68847156, + "learning_rate": 1.640554287104745e-09, + "loss": 0.71015596, + "num_input_tokens_seen": 177255325, + "step": 8212, + "time_per_iteration": 2.4656195640563965 + }, + { + "auxiliary_loss_clip": 0.01123628, + "auxiliary_loss_mlp": 0.01023534, + "balance_loss_clip": 1.03853714, + "balance_loss_mlp": 1.01629519, + "epoch": 0.9875548608188541, + "flos": 17851984456320.0, + "grad_norm": 2.073369511657995, + "language_loss": 0.79677486, + "learning_rate": 1.609161055644348e-09, + "loss": 0.81824642, + "num_input_tokens_seen": 177271250, + "step": 8213, + "time_per_iteration": 2.4907920360565186 + }, + { + "auxiliary_loss_clip": 0.01158763, + "auxiliary_loss_mlp": 0.01023699, + "balance_loss_clip": 1.04591489, + "balance_loss_mlp": 1.01667762, + "epoch": 0.9876751037094932, + "flos": 26132876887680.0, + "grad_norm": 2.0447305519179375, + "language_loss": 0.68622923, + "learning_rate": 1.5780709784849467e-09, + "loss": 0.70805389, + "num_input_tokens_seen": 177288270, + "step": 8214, + "time_per_iteration": 2.4699149131774902 + }, + { + "auxiliary_loss_clip": 0.01100066, + "auxiliary_loss_mlp": 0.01025625, + "balance_loss_clip": 1.0441165, + "balance_loss_mlp": 1.0185796, + "epoch": 0.9877953466001322, + "flos": 15991344973440.0, + "grad_norm": 2.0068672353271833, + "language_loss": 0.82483274, + "learning_rate": 1.5472840603436565e-09, + "loss": 0.8460896, + "num_input_tokens_seen": 177305500, + "step": 8215, + "time_per_iteration": 2.543728828430176 + }, + { + "auxiliary_loss_clip": 0.01141856, + "auxiliary_loss_mlp": 0.01024319, + "balance_loss_clip": 1.04618812, + "balance_loss_mlp": 1.0172441, + "epoch": 0.9879155894907714, + "flos": 18806777827200.0, + "grad_norm": 1.9405801073787239, + "language_loss": 0.77950031, + "learning_rate": 1.5168003058900757e-09, + "loss": 0.80116206, + "num_input_tokens_seen": 177323500, + "step": 8216, + "time_per_iteration": 2.4423890113830566 + }, + { + "auxiliary_loss_clip": 0.01121892, + "auxiliary_loss_mlp": 0.01026682, + "balance_loss_clip": 1.04090095, + "balance_loss_mlp": 1.01993763, + "epoch": 0.9880358323814105, + "flos": 22382044007040.0, + "grad_norm": 2.0017821861851766, + "language_loss": 0.9196167, + "learning_rate": 1.4866197197491715e-09, + "loss": 0.94110239, + "num_input_tokens_seen": 177342860, + "step": 8217, + "time_per_iteration": 2.5106611251831055 + }, + { + "auxiliary_loss_clip": 0.01155507, + "auxiliary_loss_mlp": 0.0076251, + "balance_loss_clip": 1.04641223, + "balance_loss_mlp": 1.00047112, + "epoch": 0.9881560752720495, + "flos": 15668831733120.0, + "grad_norm": 2.9338304579527676, + "language_loss": 0.7857607, + "learning_rate": 1.4567423064988371e-09, + "loss": 0.80494094, + "num_input_tokens_seen": 177360210, + "step": 8218, + "time_per_iteration": 2.408687114715576 + }, + { + "auxiliary_loss_clip": 0.01165686, + "auxiliary_loss_mlp": 0.01024608, + "balance_loss_clip": 1.04594207, + "balance_loss_mlp": 1.01744914, + "epoch": 0.9882763181626887, + "flos": 21500113374720.0, + "grad_norm": 1.8667621797794887, + "language_loss": 0.77782595, + "learning_rate": 1.4271680706718913e-09, + "loss": 0.79972887, + "num_input_tokens_seen": 177377885, + "step": 8219, + "time_per_iteration": 2.394237518310547 + }, + { + "auxiliary_loss_clip": 0.01155444, + "auxiliary_loss_mlp": 0.01025989, + "balance_loss_clip": 1.04795313, + "balance_loss_mlp": 1.01845527, + "epoch": 0.9883965610533277, + "flos": 28034598551040.0, + "grad_norm": 1.6049216300576692, + "language_loss": 0.82374293, + "learning_rate": 1.3978970167543013e-09, + "loss": 0.84555721, + "num_input_tokens_seen": 177398065, + "step": 8220, + "time_per_iteration": 2.5051913261413574 + }, + { + "auxiliary_loss_clip": 0.01128108, + "auxiliary_loss_mlp": 0.01025634, + "balance_loss_clip": 1.04279625, + "balance_loss_mlp": 1.01836777, + "epoch": 0.9885168039439668, + "flos": 14098601710080.0, + "grad_norm": 2.4830200864720564, + "language_loss": 0.7796874, + "learning_rate": 1.3689291491867372e-09, + "loss": 0.80122483, + "num_input_tokens_seen": 177416380, + "step": 8221, + "time_per_iteration": 2.443425178527832 + }, + { + "auxiliary_loss_clip": 0.01165878, + "auxiliary_loss_mlp": 0.01028487, + "balance_loss_clip": 1.04621661, + "balance_loss_mlp": 1.02078021, + "epoch": 0.988637046834606, + "flos": 26432013352320.0, + "grad_norm": 2.1511575434542562, + "language_loss": 0.73536474, + "learning_rate": 1.3402644723636836e-09, + "loss": 0.75730836, + "num_input_tokens_seen": 177438410, + "step": 8222, + "time_per_iteration": 2.483613967895508 + }, + { + "auxiliary_loss_clip": 0.01131337, + "auxiliary_loss_mlp": 0.01023972, + "balance_loss_clip": 1.04553604, + "balance_loss_mlp": 1.01703703, + "epoch": 0.988757289725245, + "flos": 25229113764480.0, + "grad_norm": 1.9001244244780227, + "language_loss": 0.83645672, + "learning_rate": 1.311902990633218e-09, + "loss": 0.85800976, + "num_input_tokens_seen": 177457375, + "step": 8223, + "time_per_iteration": 2.4874227046966553 + }, + { + "auxiliary_loss_clip": 0.01130236, + "auxiliary_loss_mlp": 0.01019633, + "balance_loss_clip": 1.04057741, + "balance_loss_mlp": 1.01319551, + "epoch": 0.9888775326158841, + "flos": 26359042872960.0, + "grad_norm": 1.5520314851969768, + "language_loss": 0.71581125, + "learning_rate": 1.2838447082978987e-09, + "loss": 0.73730993, + "num_input_tokens_seen": 177478530, + "step": 8224, + "time_per_iteration": 2.5411577224731445 + }, + { + "auxiliary_loss_clip": 0.01146979, + "auxiliary_loss_mlp": 0.01022409, + "balance_loss_clip": 1.04357994, + "balance_loss_mlp": 1.01494384, + "epoch": 0.9889977755065231, + "flos": 24316120846080.0, + "grad_norm": 3.907379924692396, + "language_loss": 0.83664012, + "learning_rate": 1.2560896296143208e-09, + "loss": 0.85833395, + "num_input_tokens_seen": 177496995, + "step": 8225, + "time_per_iteration": 2.4394185543060303 + }, + { + "auxiliary_loss_clip": 0.01165436, + "auxiliary_loss_mlp": 0.01024812, + "balance_loss_clip": 1.04679155, + "balance_loss_mlp": 1.01777577, + "epoch": 0.9891180183971623, + "flos": 18951066760320.0, + "grad_norm": 2.118567691603205, + "language_loss": 0.82306933, + "learning_rate": 1.2286377587926722e-09, + "loss": 0.84497184, + "num_input_tokens_seen": 177513785, + "step": 8226, + "time_per_iteration": 3.2136430740356445 + }, + { + "auxiliary_loss_clip": 0.01163463, + "auxiliary_loss_mlp": 0.01022611, + "balance_loss_clip": 1.04537535, + "balance_loss_mlp": 1.01529193, + "epoch": 0.9892382612878013, + "flos": 26176580760960.0, + "grad_norm": 1.9185948701520856, + "language_loss": 0.74751675, + "learning_rate": 1.2014890999973992e-09, + "loss": 0.76937747, + "num_input_tokens_seen": 177530705, + "step": 8227, + "time_per_iteration": 2.427504062652588 + }, + { + "auxiliary_loss_clip": 0.01162274, + "auxiliary_loss_mlp": 0.01023804, + "balance_loss_clip": 1.04502368, + "balance_loss_mlp": 1.01721191, + "epoch": 0.9893585041784404, + "flos": 25449605400960.0, + "grad_norm": 1.5633670665264983, + "language_loss": 0.78335214, + "learning_rate": 1.1746436573472073e-09, + "loss": 0.80521291, + "num_input_tokens_seen": 177552440, + "step": 8228, + "time_per_iteration": 3.2761571407318115 + }, + { + "auxiliary_loss_clip": 0.01145328, + "auxiliary_loss_mlp": 0.01024164, + "balance_loss_clip": 1.04438376, + "balance_loss_mlp": 1.01721108, + "epoch": 0.9894787470690796, + "flos": 20189302352640.0, + "grad_norm": 1.9098205785811575, + "language_loss": 0.691966, + "learning_rate": 1.1481014349141726e-09, + "loss": 0.71366096, + "num_input_tokens_seen": 177569660, + "step": 8229, + "time_per_iteration": 2.446530818939209 + }, + { + "auxiliary_loss_clip": 0.01136167, + "auxiliary_loss_mlp": 0.01024777, + "balance_loss_clip": 1.0436995, + "balance_loss_mlp": 1.01718926, + "epoch": 0.9895989899597186, + "flos": 24644308435200.0, + "grad_norm": 1.822870311681976, + "language_loss": 0.84340888, + "learning_rate": 1.121862436724852e-09, + "loss": 0.86501831, + "num_input_tokens_seen": 177588500, + "step": 8230, + "time_per_iteration": 3.2739226818084717 + }, + { + "auxiliary_loss_clip": 0.01153257, + "auxiliary_loss_mlp": 0.01026766, + "balance_loss_clip": 1.04843354, + "balance_loss_mlp": 1.01961923, + "epoch": 0.9897192328503577, + "flos": 21799034357760.0, + "grad_norm": 1.7222976887879071, + "language_loss": 0.70558274, + "learning_rate": 1.0959266667598388e-09, + "loss": 0.72738296, + "num_input_tokens_seen": 177607315, + "step": 8231, + "time_per_iteration": 2.500082015991211 + }, + { + "auxiliary_loss_clip": 0.01127318, + "auxiliary_loss_mlp": 0.01026287, + "balance_loss_clip": 1.04540443, + "balance_loss_mlp": 1.01870525, + "epoch": 0.9898394757409968, + "flos": 21325229032320.0, + "grad_norm": 1.9228778199221135, + "language_loss": 0.74942732, + "learning_rate": 1.0702941289533196e-09, + "loss": 0.77096337, + "num_input_tokens_seen": 177625990, + "step": 8232, + "time_per_iteration": 2.558720827102661 + }, + { + "auxiliary_loss_clip": 0.01120073, + "auxiliary_loss_mlp": 0.01024423, + "balance_loss_clip": 1.04184389, + "balance_loss_mlp": 1.0182476, + "epoch": 0.9899597186316359, + "flos": 18545024442240.0, + "grad_norm": 2.152609881846572, + "language_loss": 0.88654989, + "learning_rate": 1.0449648271939615e-09, + "loss": 0.90799487, + "num_input_tokens_seen": 177642335, + "step": 8233, + "time_per_iteration": 2.496023654937744 + }, + { + "auxiliary_loss_clip": 0.01116254, + "auxiliary_loss_mlp": 0.00762187, + "balance_loss_clip": 1.04606092, + "balance_loss_mlp": 1.00047374, + "epoch": 0.990079961522275, + "flos": 23766723348480.0, + "grad_norm": 1.4953693311670235, + "language_loss": 0.72706974, + "learning_rate": 1.0199387653240243e-09, + "loss": 0.7458542, + "num_input_tokens_seen": 177662025, + "step": 8234, + "time_per_iteration": 2.562835454940796 + }, + { + "auxiliary_loss_clip": 0.01131009, + "auxiliary_loss_mlp": 0.01021564, + "balance_loss_clip": 1.04393375, + "balance_loss_mlp": 1.01464093, + "epoch": 0.9902002044129141, + "flos": 16399182971520.0, + "grad_norm": 1.671507168851399, + "language_loss": 0.70949006, + "learning_rate": 9.952159471400267e-10, + "loss": 0.73101586, + "num_input_tokens_seen": 177679065, + "step": 8235, + "time_per_iteration": 3.1519935131073 + }, + { + "auxiliary_loss_clip": 0.01153063, + "auxiliary_loss_mlp": 0.00761591, + "balance_loss_clip": 1.04518795, + "balance_loss_mlp": 1.0004065, + "epoch": 0.9903204473035532, + "flos": 22559657783040.0, + "grad_norm": 1.8935456654621574, + "language_loss": 0.84576666, + "learning_rate": 9.707963763923022e-10, + "loss": 0.86491323, + "num_input_tokens_seen": 177698115, + "step": 8236, + "time_per_iteration": 2.517108917236328 + }, + { + "auxiliary_loss_clip": 0.01132951, + "auxiliary_loss_mlp": 0.01021913, + "balance_loss_clip": 1.04107857, + "balance_loss_mlp": 1.01476312, + "epoch": 0.9904406901941922, + "flos": 16144001775360.0, + "grad_norm": 1.9080028037985008, + "language_loss": 0.79011548, + "learning_rate": 9.466800567854427e-10, + "loss": 0.8116641, + "num_input_tokens_seen": 177716715, + "step": 8237, + "time_per_iteration": 2.46001935005188 + }, + { + "auxiliary_loss_clip": 0.0112425, + "auxiliary_loss_mlp": 0.0102061, + "balance_loss_clip": 1.03991556, + "balance_loss_mlp": 1.01293325, + "epoch": 0.9905609330848314, + "flos": 26651499408000.0, + "grad_norm": 3.4816435553339224, + "language_loss": 0.68279338, + "learning_rate": 9.228669919778553e-10, + "loss": 0.70424193, + "num_input_tokens_seen": 177735640, + "step": 8238, + "time_per_iteration": 2.5612010955810547 + }, + { + "auxiliary_loss_clip": 0.01131846, + "auxiliary_loss_mlp": 0.01027475, + "balance_loss_clip": 1.04183245, + "balance_loss_mlp": 1.01966715, + "epoch": 0.9906811759754705, + "flos": 23111820627840.0, + "grad_norm": 2.039818090992299, + "language_loss": 0.79856187, + "learning_rate": 8.993571855817617e-10, + "loss": 0.82015508, + "num_input_tokens_seen": 177754470, + "step": 8239, + "time_per_iteration": 2.4819586277008057 + }, + { + "auxiliary_loss_clip": 0.01151675, + "auxiliary_loss_mlp": 0.01023849, + "balance_loss_clip": 1.04525828, + "balance_loss_mlp": 1.01685715, + "epoch": 0.9908014188661095, + "flos": 22090593052800.0, + "grad_norm": 3.4560413166188786, + "language_loss": 0.751387, + "learning_rate": 8.761506411638642e-10, + "loss": 0.77314222, + "num_input_tokens_seen": 177773935, + "step": 8240, + "time_per_iteration": 2.5067315101623535 + }, + { + "auxiliary_loss_clip": 0.0113594, + "auxiliary_loss_mlp": 0.01030578, + "balance_loss_clip": 1.04533064, + "balance_loss_mlp": 1.02366042, + "epoch": 0.9909216617567487, + "flos": 19242948677760.0, + "grad_norm": 1.7315299882238075, + "language_loss": 0.73853493, + "learning_rate": 8.53247362244236e-10, + "loss": 0.76020014, + "num_input_tokens_seen": 177792745, + "step": 8241, + "time_per_iteration": 2.50022292137146 + }, + { + "auxiliary_loss_clip": 0.01138599, + "auxiliary_loss_mlp": 0.01023076, + "balance_loss_clip": 1.04534411, + "balance_loss_mlp": 1.01628959, + "epoch": 0.9910419046473877, + "flos": 23621213352960.0, + "grad_norm": 1.728782356122078, + "language_loss": 0.68237358, + "learning_rate": 8.306473522976532e-10, + "loss": 0.70399034, + "num_input_tokens_seen": 177812150, + "step": 8242, + "time_per_iteration": 2.5199990272521973 + }, + { + "auxiliary_loss_clip": 0.01165397, + "auxiliary_loss_mlp": 0.01019958, + "balance_loss_clip": 1.0478332, + "balance_loss_mlp": 1.01259971, + "epoch": 0.9911621475380268, + "flos": 22711380831360.0, + "grad_norm": 1.7422668945760342, + "language_loss": 0.71756887, + "learning_rate": 8.083506147522623e-10, + "loss": 0.73942244, + "num_input_tokens_seen": 177831545, + "step": 8243, + "time_per_iteration": 2.4368674755096436 + }, + { + "auxiliary_loss_clip": 0.01144136, + "auxiliary_loss_mlp": 0.01026857, + "balance_loss_clip": 1.04296041, + "balance_loss_mlp": 1.01975846, + "epoch": 0.991282390428666, + "flos": 13516956777600.0, + "grad_norm": 2.071491621034792, + "language_loss": 0.853513, + "learning_rate": 7.863571529906909e-10, + "loss": 0.87522292, + "num_input_tokens_seen": 177847130, + "step": 8244, + "time_per_iteration": 2.428632974624634 + }, + { + "auxiliary_loss_clip": 0.01034688, + "auxiliary_loss_mlp": 0.0100149, + "balance_loss_clip": 1.00752044, + "balance_loss_mlp": 1.00058353, + "epoch": 0.991402633319305, + "flos": 61830492071040.0, + "grad_norm": 0.7218832933836399, + "language_loss": 0.59645915, + "learning_rate": 7.646669703489372e-10, + "loss": 0.61682093, + "num_input_tokens_seen": 177911440, + "step": 8245, + "time_per_iteration": 3.1910464763641357 + }, + { + "auxiliary_loss_clip": 0.01086141, + "auxiliary_loss_mlp": 0.01021818, + "balance_loss_clip": 1.04061437, + "balance_loss_mlp": 1.01485324, + "epoch": 0.9915228762099441, + "flos": 18770148933120.0, + "grad_norm": 1.8423509444893138, + "language_loss": 0.57068819, + "learning_rate": 7.432800701177023e-10, + "loss": 0.59176779, + "num_input_tokens_seen": 177929440, + "step": 8246, + "time_per_iteration": 2.581807851791382 + }, + { + "auxiliary_loss_clip": 0.01043244, + "auxiliary_loss_mlp": 0.01001444, + "balance_loss_clip": 1.00897479, + "balance_loss_mlp": 1.00056732, + "epoch": 0.9916431191005832, + "flos": 65936660244480.0, + "grad_norm": 0.7936290014642606, + "language_loss": 0.57828838, + "learning_rate": 7.221964555415017e-10, + "loss": 0.59873521, + "num_input_tokens_seen": 177989100, + "step": 8247, + "time_per_iteration": 3.0172791481018066 + }, + { + "auxiliary_loss_clip": 0.01135196, + "auxiliary_loss_mlp": 0.01019479, + "balance_loss_clip": 1.04408693, + "balance_loss_mlp": 1.01303303, + "epoch": 0.9917633619912223, + "flos": 16581573256320.0, + "grad_norm": 1.7519676958283414, + "language_loss": 0.75069213, + "learning_rate": 7.01416129818222e-10, + "loss": 0.77223885, + "num_input_tokens_seen": 178006720, + "step": 8248, + "time_per_iteration": 2.4533557891845703 + }, + { + "auxiliary_loss_clip": 0.01129357, + "auxiliary_loss_mlp": 0.01024999, + "balance_loss_clip": 1.0436641, + "balance_loss_mlp": 1.01781344, + "epoch": 0.9918836048818613, + "flos": 25411108999680.0, + "grad_norm": 1.824356976086391, + "language_loss": 0.58724999, + "learning_rate": 6.809390961006745e-10, + "loss": 0.6087935, + "num_input_tokens_seen": 178026850, + "step": 8249, + "time_per_iteration": 2.563532590866089 + }, + { + "auxiliary_loss_clip": 0.0113507, + "auxiliary_loss_mlp": 0.01027173, + "balance_loss_clip": 1.04441929, + "balance_loss_mlp": 1.02013946, + "epoch": 0.9920038477725005, + "flos": 25046867134080.0, + "grad_norm": 1.8895234176989288, + "language_loss": 0.68826032, + "learning_rate": 6.607653574948191e-10, + "loss": 0.70988274, + "num_input_tokens_seen": 178047630, + "step": 8250, + "time_per_iteration": 2.498539686203003 + }, + { + "auxiliary_loss_clip": 0.01142, + "auxiliary_loss_mlp": 0.01024422, + "balance_loss_clip": 1.04175401, + "balance_loss_mlp": 1.01758504, + "epoch": 0.9921240906631396, + "flos": 21829773421440.0, + "grad_norm": 2.000025237290295, + "language_loss": 0.81559461, + "learning_rate": 6.408949170613187e-10, + "loss": 0.83725882, + "num_input_tokens_seen": 178066895, + "step": 8251, + "time_per_iteration": 2.4991910457611084 + }, + { + "auxiliary_loss_clip": 0.01135389, + "auxiliary_loss_mlp": 0.01024816, + "balance_loss_clip": 1.04371572, + "balance_loss_mlp": 1.01703203, + "epoch": 0.9922443335537786, + "flos": 24864225454080.0, + "grad_norm": 2.615890616532026, + "language_loss": 0.81753099, + "learning_rate": 6.213277778144288e-10, + "loss": 0.83913302, + "num_input_tokens_seen": 178088540, + "step": 8252, + "time_per_iteration": 2.5331714153289795 + }, + { + "auxiliary_loss_clip": 0.0109828, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.03925061, + "balance_loss_mlp": 1.02021778, + "epoch": 0.9923645764444178, + "flos": 21613088626560.0, + "grad_norm": 2.0226087257790306, + "language_loss": 0.6694026, + "learning_rate": 6.020639427224416e-10, + "loss": 0.69066203, + "num_input_tokens_seen": 178106185, + "step": 8253, + "time_per_iteration": 3.434929847717285 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.01021989, + "balance_loss_clip": 1.04462194, + "balance_loss_mlp": 1.01495004, + "epoch": 0.9924848193350568, + "flos": 25001798544000.0, + "grad_norm": 2.0996007644077106, + "language_loss": 0.72443128, + "learning_rate": 5.831034147076864e-10, + "loss": 0.74601185, + "num_input_tokens_seen": 178123435, + "step": 8254, + "time_per_iteration": 2.504014730453491 + }, + { + "auxiliary_loss_clip": 0.01049314, + "auxiliary_loss_mlp": 0.01000677, + "balance_loss_clip": 1.00832915, + "balance_loss_mlp": 0.9997946, + "epoch": 0.9926050622256959, + "flos": 68912543151360.0, + "grad_norm": 0.6875357353503534, + "language_loss": 0.55787557, + "learning_rate": 5.644461966463065e-10, + "loss": 0.57837552, + "num_input_tokens_seen": 178191045, + "step": 8255, + "time_per_iteration": 4.022282361984253 + }, + { + "auxiliary_loss_clip": 0.01136333, + "auxiliary_loss_mlp": 0.0102066, + "balance_loss_clip": 1.0449394, + "balance_loss_mlp": 1.01431227, + "epoch": 0.9927253051163349, + "flos": 20923675914240.0, + "grad_norm": 1.7225020069692987, + "language_loss": 0.75699162, + "learning_rate": 5.460922913687049e-10, + "loss": 0.77856159, + "num_input_tokens_seen": 178210135, + "step": 8256, + "time_per_iteration": 3.2249083518981934 + }, + { + "auxiliary_loss_clip": 0.01107072, + "auxiliary_loss_mlp": 0.00762176, + "balance_loss_clip": 1.03843856, + "balance_loss_mlp": 1.00043488, + "epoch": 0.9928455480069741, + "flos": 22308211601280.0, + "grad_norm": 2.286550897856297, + "language_loss": 0.74978691, + "learning_rate": 5.280417016593208e-10, + "loss": 0.76847941, + "num_input_tokens_seen": 178229925, + "step": 8257, + "time_per_iteration": 2.6291749477386475 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.00760879, + "balance_loss_clip": 1.04695773, + "balance_loss_mlp": 1.00039744, + "epoch": 0.9929657908976132, + "flos": 17383889393280.0, + "grad_norm": 1.732674229424684, + "language_loss": 0.74804699, + "learning_rate": 5.102944302559642e-10, + "loss": 0.76713943, + "num_input_tokens_seen": 178247420, + "step": 8258, + "time_per_iteration": 2.442061424255371 + }, + { + "auxiliary_loss_clip": 0.01104334, + "auxiliary_loss_mlp": 0.01026245, + "balance_loss_clip": 1.04263341, + "balance_loss_mlp": 1.01882112, + "epoch": 0.9930860337882522, + "flos": 22674680110080.0, + "grad_norm": 3.710278410150471, + "language_loss": 0.79882371, + "learning_rate": 4.9285047985137e-10, + "loss": 0.82012951, + "num_input_tokens_seen": 178266840, + "step": 8259, + "time_per_iteration": 2.6384880542755127 + }, + { + "auxiliary_loss_clip": 0.01153456, + "auxiliary_loss_mlp": 0.01028022, + "balance_loss_clip": 1.04535997, + "balance_loss_mlp": 1.02107477, + "epoch": 0.9932062766788914, + "flos": 28147789284480.0, + "grad_norm": 1.6430070788512317, + "language_loss": 0.74637204, + "learning_rate": 4.757098530916436e-10, + "loss": 0.76818681, + "num_input_tokens_seen": 178287285, + "step": 8260, + "time_per_iteration": 2.5147809982299805 + }, + { + "auxiliary_loss_clip": 0.01156336, + "auxiliary_loss_mlp": 0.01025337, + "balance_loss_clip": 1.04876614, + "balance_loss_mlp": 1.0181278, + "epoch": 0.9933265195695304, + "flos": 20156659868160.0, + "grad_norm": 3.782602774906576, + "language_loss": 0.77083755, + "learning_rate": 4.5887255257670563e-10, + "loss": 0.79265428, + "num_input_tokens_seen": 178304325, + "step": 8261, + "time_per_iteration": 2.4801785945892334 + }, + { + "auxiliary_loss_clip": 0.01166238, + "auxiliary_loss_mlp": 0.01027377, + "balance_loss_clip": 1.04711235, + "balance_loss_mlp": 1.0200752, + "epoch": 0.9934467624601695, + "flos": 21362037494400.0, + "grad_norm": 2.6216314162661725, + "language_loss": 0.7667163, + "learning_rate": 4.4233858086117906e-10, + "loss": 0.78865242, + "num_input_tokens_seen": 178322850, + "step": 8262, + "time_per_iteration": 3.1451010704040527 + }, + { + "auxiliary_loss_clip": 0.0110727, + "auxiliary_loss_mlp": 0.01025509, + "balance_loss_clip": 1.0444746, + "balance_loss_mlp": 1.01830852, + "epoch": 0.9935670053508087, + "flos": 19756040503680.0, + "grad_norm": 2.265969735330147, + "language_loss": 0.67954516, + "learning_rate": 4.261079404528356e-10, + "loss": 0.70087296, + "num_input_tokens_seen": 178342330, + "step": 8263, + "time_per_iteration": 2.5545506477355957 + }, + { + "auxiliary_loss_clip": 0.01148542, + "auxiliary_loss_mlp": 0.01020643, + "balance_loss_clip": 1.04423273, + "balance_loss_mlp": 1.01347888, + "epoch": 0.9936872482414477, + "flos": 21978838863360.0, + "grad_norm": 3.2466274623058737, + "language_loss": 0.69114614, + "learning_rate": 4.1018063381437205e-10, + "loss": 0.71283805, + "num_input_tokens_seen": 178362715, + "step": 8264, + "time_per_iteration": 2.4658918380737305 + }, + { + "auxiliary_loss_clip": 0.01047146, + "auxiliary_loss_mlp": 0.01002243, + "balance_loss_clip": 1.01047111, + "balance_loss_mlp": 1.00147402, + "epoch": 0.9938074911320868, + "flos": 69810667839360.0, + "grad_norm": 0.8778695489876152, + "language_loss": 0.61184144, + "learning_rate": 3.9455666336141167e-10, + "loss": 0.63233531, + "num_input_tokens_seen": 178426495, + "step": 8265, + "time_per_iteration": 3.1018972396850586 + }, + { + "auxiliary_loss_clip": 0.01164182, + "auxiliary_loss_mlp": 0.01023043, + "balance_loss_clip": 1.04696429, + "balance_loss_mlp": 1.01627243, + "epoch": 0.9939277340227259, + "flos": 15084170058240.0, + "grad_norm": 2.5755021350693985, + "language_loss": 0.83323324, + "learning_rate": 3.7923603146450267e-10, + "loss": 0.85510552, + "num_input_tokens_seen": 178442555, + "step": 8266, + "time_per_iteration": 2.384077787399292 + }, + { + "auxiliary_loss_clip": 0.01123074, + "auxiliary_loss_mlp": 0.01024825, + "balance_loss_clip": 1.03999043, + "balance_loss_mlp": 1.01764226, + "epoch": 0.994047976913365, + "flos": 17712364291200.0, + "grad_norm": 1.9986799115018, + "language_loss": 0.81031752, + "learning_rate": 3.642187404473418e-10, + "loss": 0.83179653, + "num_input_tokens_seen": 178460715, + "step": 8267, + "time_per_iteration": 2.5039939880371094 + }, + { + "auxiliary_loss_clip": 0.01150662, + "auxiliary_loss_mlp": 0.01020048, + "balance_loss_clip": 1.0439105, + "balance_loss_mlp": 1.0130173, + "epoch": 0.994168219804004, + "flos": 19171558396800.0, + "grad_norm": 2.4422687537614896, + "language_loss": 0.86333156, + "learning_rate": 3.495047925885508e-10, + "loss": 0.88503867, + "num_input_tokens_seen": 178479050, + "step": 8268, + "time_per_iteration": 2.4145679473876953 + }, + { + "auxiliary_loss_clip": 0.01135162, + "auxiliary_loss_mlp": 0.01025372, + "balance_loss_clip": 1.04222631, + "balance_loss_mlp": 1.0183835, + "epoch": 0.9942884626946432, + "flos": 17851589406720.0, + "grad_norm": 3.280059971402832, + "language_loss": 0.82468426, + "learning_rate": 3.350941901199e-10, + "loss": 0.84628963, + "num_input_tokens_seen": 178495970, + "step": 8269, + "time_per_iteration": 2.4775407314300537 + }, + { + "auxiliary_loss_clip": 0.01139904, + "auxiliary_loss_mlp": 0.01024556, + "balance_loss_clip": 1.04442763, + "balance_loss_mlp": 1.01753485, + "epoch": 0.9944087055852823, + "flos": 18796578364800.0, + "grad_norm": 2.8330819347656204, + "language_loss": 0.83172882, + "learning_rate": 3.2098693522764066e-10, + "loss": 0.85337341, + "num_input_tokens_seen": 178509170, + "step": 8270, + "time_per_iteration": 2.444460868835449 + }, + { + "auxiliary_loss_clip": 0.01143707, + "auxiliary_loss_mlp": 0.00761732, + "balance_loss_clip": 1.04585123, + "balance_loss_mlp": 1.00047684, + "epoch": 0.9945289484759213, + "flos": 20996969616000.0, + "grad_norm": 1.8870604872235957, + "language_loss": 0.81237614, + "learning_rate": 3.071830300516165e-10, + "loss": 0.83143055, + "num_input_tokens_seen": 178527000, + "step": 8271, + "time_per_iteration": 2.5649683475494385 + }, + { + "auxiliary_loss_clip": 0.01155754, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.04585576, + "balance_loss_mlp": 1.02126396, + "epoch": 0.9946491913665605, + "flos": 14756952136320.0, + "grad_norm": 2.946529532764987, + "language_loss": 0.70839703, + "learning_rate": 2.9368247668615234e-10, + "loss": 0.73024082, + "num_input_tokens_seen": 178545590, + "step": 8272, + "time_per_iteration": 2.434903621673584 + }, + { + "auxiliary_loss_clip": 0.01169771, + "auxiliary_loss_mlp": 0.01026383, + "balance_loss_clip": 1.04889834, + "balance_loss_mlp": 1.01915324, + "epoch": 0.9947694342571995, + "flos": 12669931186560.0, + "grad_norm": 2.834484945376903, + "language_loss": 0.61655366, + "learning_rate": 2.804852771789434e-10, + "loss": 0.63851517, + "num_input_tokens_seen": 178558890, + "step": 8273, + "time_per_iteration": 2.3944907188415527 + }, + { + "auxiliary_loss_clip": 0.01162144, + "auxiliary_loss_mlp": 0.01023541, + "balance_loss_clip": 1.04530811, + "balance_loss_mlp": 1.01653194, + "epoch": 0.9948896771478386, + "flos": 18843442634880.0, + "grad_norm": 1.7999877857802915, + "language_loss": 0.55636847, + "learning_rate": 2.675914335321661e-10, + "loss": 0.57822537, + "num_input_tokens_seen": 178577645, + "step": 8274, + "time_per_iteration": 2.3823635578155518 + }, + { + "auxiliary_loss_clip": 0.01157135, + "auxiliary_loss_mlp": 0.01026259, + "balance_loss_clip": 1.0468781, + "balance_loss_mlp": 1.01828969, + "epoch": 0.9950099200384778, + "flos": 24900207903360.0, + "grad_norm": 2.2975960007290586, + "language_loss": 0.79673886, + "learning_rate": 2.550009477018111e-10, + "loss": 0.81857276, + "num_input_tokens_seen": 178596415, + "step": 8275, + "time_per_iteration": 2.5290284156799316 + }, + { + "auxiliary_loss_clip": 0.01137668, + "auxiliary_loss_mlp": 0.00762007, + "balance_loss_clip": 1.04595709, + "balance_loss_mlp": 1.00047469, + "epoch": 0.9951301629291168, + "flos": 23733613987200.0, + "grad_norm": 2.1993405447889796, + "language_loss": 0.62824106, + "learning_rate": 2.4271382159790634e-10, + "loss": 0.64723784, + "num_input_tokens_seen": 178613845, + "step": 8276, + "time_per_iteration": 2.50490665435791 + }, + { + "auxiliary_loss_clip": 0.01107082, + "auxiliary_loss_mlp": 0.01029012, + "balance_loss_clip": 1.04100072, + "balance_loss_mlp": 1.02220547, + "epoch": 0.9952504058197559, + "flos": 22236893147520.0, + "grad_norm": 1.7755456709311237, + "language_loss": 0.85643125, + "learning_rate": 2.3073005708429406e-10, + "loss": 0.87779218, + "num_input_tokens_seen": 178633490, + "step": 8277, + "time_per_iteration": 2.58384370803833 + }, + { + "auxiliary_loss_clip": 0.01122953, + "auxiliary_loss_mlp": 0.01020328, + "balance_loss_clip": 1.04578829, + "balance_loss_mlp": 1.01383066, + "epoch": 0.995370648710395, + "flos": 21211032718080.0, + "grad_norm": 1.7508562613524354, + "language_loss": 0.72135162, + "learning_rate": 2.190496559788535e-10, + "loss": 0.74278438, + "num_input_tokens_seen": 178651775, + "step": 8278, + "time_per_iteration": 2.5453786849975586 + }, + { + "auxiliary_loss_clip": 0.01135067, + "auxiliary_loss_mlp": 0.01027814, + "balance_loss_clip": 1.04354143, + "balance_loss_mlp": 1.02114177, + "epoch": 0.9954908916010341, + "flos": 14866731077760.0, + "grad_norm": 2.746550380378938, + "language_loss": 0.76418847, + "learning_rate": 2.0767262005372265e-10, + "loss": 0.78581727, + "num_input_tokens_seen": 178669290, + "step": 8279, + "time_per_iteration": 3.3816354274749756 + }, + { + "auxiliary_loss_clip": 0.01129969, + "auxiliary_loss_mlp": 0.01023569, + "balance_loss_clip": 1.04281509, + "balance_loss_mlp": 1.01700974, + "epoch": 0.9956111344916732, + "flos": 19208259118080.0, + "grad_norm": 2.048887141908911, + "language_loss": 0.74988109, + "learning_rate": 1.965989510346322e-10, + "loss": 0.77141643, + "num_input_tokens_seen": 178688410, + "step": 8280, + "time_per_iteration": 2.513462543487549 + }, + { + "auxiliary_loss_clip": 0.01105228, + "auxiliary_loss_mlp": 0.01026597, + "balance_loss_clip": 1.04150772, + "balance_loss_mlp": 1.01866937, + "epoch": 0.9957313773823123, + "flos": 20047060494720.0, + "grad_norm": 1.9790593245790244, + "language_loss": 0.71111608, + "learning_rate": 1.8582865060134955e-10, + "loss": 0.73243439, + "num_input_tokens_seen": 178706600, + "step": 8281, + "time_per_iteration": 3.2828047275543213 + }, + { + "auxiliary_loss_clip": 0.01060818, + "auxiliary_loss_mlp": 0.0100096, + "balance_loss_clip": 1.0073421, + "balance_loss_mlp": 1.00010133, + "epoch": 0.9958516202729514, + "flos": 57483253768320.0, + "grad_norm": 0.7859882120812638, + "language_loss": 0.557284, + "learning_rate": 1.7536172038790098e-10, + "loss": 0.57790178, + "num_input_tokens_seen": 178766910, + "step": 8282, + "time_per_iteration": 3.0593149662017822 + }, + { + "auxiliary_loss_clip": 0.01137493, + "auxiliary_loss_mlp": 0.01020369, + "balance_loss_clip": 1.04611886, + "balance_loss_mlp": 1.01371384, + "epoch": 0.9959718631635904, + "flos": 27782900974080.0, + "grad_norm": 2.007162501087017, + "language_loss": 0.69224477, + "learning_rate": 1.651981619819054e-10, + "loss": 0.71382344, + "num_input_tokens_seen": 178784060, + "step": 8283, + "time_per_iteration": 3.3584096431732178 + }, + { + "auxiliary_loss_clip": 0.01113905, + "auxiliary_loss_mlp": 0.01023593, + "balance_loss_clip": 1.04202104, + "balance_loss_mlp": 1.0162971, + "epoch": 0.9960921060542296, + "flos": 24024095274240.0, + "grad_norm": 2.3053812037058274, + "language_loss": 0.71121198, + "learning_rate": 1.5533797692546257e-10, + "loss": 0.73258698, + "num_input_tokens_seen": 178802795, + "step": 8284, + "time_per_iteration": 2.545639753341675 + }, + { + "auxiliary_loss_clip": 0.01147523, + "auxiliary_loss_mlp": 0.01019912, + "balance_loss_clip": 1.0439508, + "balance_loss_mlp": 1.01255369, + "epoch": 0.9962123489448687, + "flos": 18697393935360.0, + "grad_norm": 1.9730891523720424, + "language_loss": 0.84074718, + "learning_rate": 1.4578116671404296e-10, + "loss": 0.86242151, + "num_input_tokens_seen": 178821075, + "step": 8285, + "time_per_iteration": 2.429304599761963 + }, + { + "auxiliary_loss_clip": 0.01147004, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.04730105, + "balance_loss_mlp": 1.02129698, + "epoch": 0.9963325918355077, + "flos": 20010754823040.0, + "grad_norm": 2.318298631824008, + "language_loss": 0.71324831, + "learning_rate": 1.3652773279759777e-10, + "loss": 0.7350018, + "num_input_tokens_seen": 178837725, + "step": 8286, + "time_per_iteration": 2.428987503051758 + }, + { + "auxiliary_loss_clip": 0.01149722, + "auxiliary_loss_mlp": 0.0102428, + "balance_loss_clip": 1.04419994, + "balance_loss_mlp": 1.01704144, + "epoch": 0.9964528347261468, + "flos": 33108488991360.0, + "grad_norm": 2.262626396535188, + "language_loss": 0.62857044, + "learning_rate": 1.2757767657989305e-10, + "loss": 0.65031046, + "num_input_tokens_seen": 178861515, + "step": 8287, + "time_per_iteration": 2.546614408493042 + }, + { + "auxiliary_loss_clip": 0.01149698, + "auxiliary_loss_mlp": 0.0101993, + "balance_loss_clip": 1.04598641, + "balance_loss_mlp": 1.01333213, + "epoch": 0.9965730776167859, + "flos": 23109342589440.0, + "grad_norm": 1.900974434634456, + "language_loss": 0.87091804, + "learning_rate": 1.1893099941850948e-10, + "loss": 0.89261436, + "num_input_tokens_seen": 178880410, + "step": 8288, + "time_per_iteration": 2.4448535442352295 + }, + { + "auxiliary_loss_clip": 0.01140624, + "auxiliary_loss_mlp": 0.01024703, + "balance_loss_clip": 1.04210591, + "balance_loss_mlp": 1.01733637, + "epoch": 0.996693320507425, + "flos": 22965843755520.0, + "grad_norm": 1.9484440571806665, + "language_loss": 0.77200848, + "learning_rate": 1.105877026252866e-10, + "loss": 0.79366171, + "num_input_tokens_seen": 178898740, + "step": 8289, + "time_per_iteration": 3.2521162033081055 + }, + { + "auxiliary_loss_clip": 0.01165879, + "auxiliary_loss_mlp": 0.0102603, + "balance_loss_clip": 1.04586184, + "balance_loss_mlp": 1.01809692, + "epoch": 0.996813563398064, + "flos": 13222740476160.0, + "grad_norm": 1.9755491691134537, + "language_loss": 0.71674532, + "learning_rate": 1.0254778746565663e-10, + "loss": 0.73866439, + "num_input_tokens_seen": 178914015, + "step": 8290, + "time_per_iteration": 2.352820634841919 + }, + { + "auxiliary_loss_clip": 0.01121647, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.04423678, + "balance_loss_mlp": 1.01964045, + "epoch": 0.9969338062887032, + "flos": 14647855553280.0, + "grad_norm": 2.1297054866249323, + "language_loss": 0.7353223, + "learning_rate": 9.481125515953259e-11, + "loss": 0.75680548, + "num_input_tokens_seen": 178932075, + "step": 8291, + "time_per_iteration": 2.52404522895813 + }, + { + "auxiliary_loss_clip": 0.01108891, + "auxiliary_loss_mlp": 0.01023996, + "balance_loss_clip": 1.03809643, + "balance_loss_mlp": 1.01663804, + "epoch": 0.9970540491793423, + "flos": 25735741142400.0, + "grad_norm": 1.607387266526498, + "language_loss": 0.79856205, + "learning_rate": 8.737810688064228e-11, + "loss": 0.81989086, + "num_input_tokens_seen": 178951910, + "step": 8292, + "time_per_iteration": 2.5697712898254395 + }, + { + "auxiliary_loss_clip": 0.01120101, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.04326797, + "balance_loss_mlp": 1.02142727, + "epoch": 0.9971742920699813, + "flos": 21470236237440.0, + "grad_norm": 2.0263255259910165, + "language_loss": 0.79260868, + "learning_rate": 8.024834375608414e-11, + "loss": 0.814116, + "num_input_tokens_seen": 178970500, + "step": 8293, + "time_per_iteration": 2.500955820083618 + }, + { + "auxiliary_loss_clip": 0.01060692, + "auxiliary_loss_mlp": 0.01000356, + "balance_loss_clip": 1.00724888, + "balance_loss_mlp": 0.99950927, + "epoch": 0.9972945349606205, + "flos": 72211223629440.0, + "grad_norm": 0.8280346749141044, + "language_loss": 0.6286484, + "learning_rate": 7.342196686788149e-11, + "loss": 0.64925897, + "num_input_tokens_seen": 179023665, + "step": 8294, + "time_per_iteration": 2.9057698249816895 + }, + { + "auxiliary_loss_clip": 0.01137953, + "auxiliary_loss_mlp": 0.01026779, + "balance_loss_clip": 1.04893553, + "balance_loss_mlp": 1.01976371, + "epoch": 0.9974147778512595, + "flos": 19678293515520.0, + "grad_norm": 1.9549901843514716, + "language_loss": 0.69124424, + "learning_rate": 6.689897725142834e-11, + "loss": 0.71289158, + "num_input_tokens_seen": 179043140, + "step": 8295, + "time_per_iteration": 2.4749598503112793 + }, + { + "auxiliary_loss_clip": 0.01138681, + "auxiliary_loss_mlp": 0.0102738, + "balance_loss_clip": 1.04306006, + "balance_loss_mlp": 1.020257, + "epoch": 0.9975350207418986, + "flos": 15960821391360.0, + "grad_norm": 2.2458612826183146, + "language_loss": 0.89006549, + "learning_rate": 6.067937589615545e-11, + "loss": 0.91172612, + "num_input_tokens_seen": 179061215, + "step": 8296, + "time_per_iteration": 2.4623379707336426 + }, + { + "auxiliary_loss_clip": 0.01044403, + "auxiliary_loss_mlp": 0.01001086, + "balance_loss_clip": 1.00851381, + "balance_loss_mlp": 1.00013244, + "epoch": 0.9976552636325378, + "flos": 59961879768960.0, + "grad_norm": 0.7528782873822731, + "language_loss": 0.57673573, + "learning_rate": 5.476316374575241e-11, + "loss": 0.59719062, + "num_input_tokens_seen": 179124700, + "step": 8297, + "time_per_iteration": 3.042626142501831 + }, + { + "auxiliary_loss_clip": 0.011653, + "auxiliary_loss_mlp": 0.01024709, + "balance_loss_clip": 1.04595232, + "balance_loss_mlp": 1.01693606, + "epoch": 0.9977755065231768, + "flos": 22487872452480.0, + "grad_norm": 1.8741773739445933, + "language_loss": 0.7285217, + "learning_rate": 4.9150341697723476e-11, + "loss": 0.7504217, + "num_input_tokens_seen": 179144590, + "step": 8298, + "time_per_iteration": 2.4386112689971924 + }, + { + "auxiliary_loss_clip": 0.01134111, + "auxiliary_loss_mlp": 0.01029019, + "balance_loss_clip": 1.04580164, + "balance_loss_mlp": 1.02175307, + "epoch": 0.9978957494138159, + "flos": 26030280666240.0, + "grad_norm": 1.5686091087729772, + "language_loss": 0.66557115, + "learning_rate": 4.384091060338768e-11, + "loss": 0.68720245, + "num_input_tokens_seen": 179165060, + "step": 8299, + "time_per_iteration": 2.5679140090942383 + }, + { + "auxiliary_loss_clip": 0.01147877, + "auxiliary_loss_mlp": 0.01024976, + "balance_loss_clip": 1.04495907, + "balance_loss_mlp": 1.01781726, + "epoch": 0.998015992304455, + "flos": 22637835734400.0, + "grad_norm": 2.3049236335459375, + "language_loss": 0.73332906, + "learning_rate": 3.883487126810081e-11, + "loss": 0.75505757, + "num_input_tokens_seen": 179184320, + "step": 8300, + "time_per_iteration": 2.5333354473114014 + }, + { + "auxiliary_loss_clip": 0.01140805, + "auxiliary_loss_mlp": 0.01022428, + "balance_loss_clip": 1.0424962, + "balance_loss_mlp": 1.01512599, + "epoch": 0.9981362351950941, + "flos": 18223444955520.0, + "grad_norm": 1.7308427971709435, + "language_loss": 0.79488659, + "learning_rate": 3.41322244516995e-11, + "loss": 0.8165189, + "num_input_tokens_seen": 179202265, + "step": 8301, + "time_per_iteration": 2.470193386077881 + }, + { + "auxiliary_loss_clip": 0.01097787, + "auxiliary_loss_mlp": 0.0102471, + "balance_loss_clip": 1.04121625, + "balance_loss_mlp": 1.0175364, + "epoch": 0.9982564780857331, + "flos": 33474095573760.0, + "grad_norm": 1.6369820164721491, + "language_loss": 0.63092965, + "learning_rate": 2.9732970866946925e-11, + "loss": 0.65215462, + "num_input_tokens_seen": 179222145, + "step": 8302, + "time_per_iteration": 2.629645347595215 + }, + { + "auxiliary_loss_clip": 0.0110876, + "auxiliary_loss_mlp": 0.01023539, + "balance_loss_clip": 1.03783226, + "balance_loss_mlp": 1.01604652, + "epoch": 0.9983767209763723, + "flos": 15523465392000.0, + "grad_norm": 2.0318882216207843, + "language_loss": 0.78223205, + "learning_rate": 2.563711118175327e-11, + "loss": 0.80355501, + "num_input_tokens_seen": 179239030, + "step": 8303, + "time_per_iteration": 2.491415023803711 + }, + { + "auxiliary_loss_clip": 0.01119852, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_clip": 1.04409909, + "balance_loss_mlp": 1.01711535, + "epoch": 0.9984969638670114, + "flos": 19974377324160.0, + "grad_norm": 1.7846632600815349, + "language_loss": 0.83492184, + "learning_rate": 2.184464601717728e-11, + "loss": 0.85636377, + "num_input_tokens_seen": 179257345, + "step": 8304, + "time_per_iteration": 2.490250587463379 + }, + { + "auxiliary_loss_clip": 0.01156084, + "auxiliary_loss_mlp": 0.01023918, + "balance_loss_clip": 1.04848409, + "balance_loss_mlp": 1.01642227, + "epoch": 0.9986172067576504, + "flos": 20375750874240.0, + "grad_norm": 2.4936858363745684, + "language_loss": 0.78070009, + "learning_rate": 1.8355575948758585e-11, + "loss": 0.80250013, + "num_input_tokens_seen": 179275330, + "step": 8305, + "time_per_iteration": 2.4227540493011475 + }, + { + "auxiliary_loss_clip": 0.01134825, + "auxiliary_loss_mlp": 0.01024285, + "balance_loss_clip": 1.04143059, + "balance_loss_mlp": 1.01687062, + "epoch": 0.9987374496482896, + "flos": 23727903724800.0, + "grad_norm": 2.092696406343103, + "language_loss": 0.73691463, + "learning_rate": 1.5169901505407424e-11, + "loss": 0.75850576, + "num_input_tokens_seen": 179292395, + "step": 8306, + "time_per_iteration": 3.3225760459899902 + }, + { + "auxiliary_loss_clip": 0.01135571, + "auxiliary_loss_mlp": 0.01021153, + "balance_loss_clip": 1.04548454, + "balance_loss_mlp": 1.0143795, + "epoch": 0.9988576925389286, + "flos": 25044029959680.0, + "grad_norm": 3.150302437516206, + "language_loss": 0.74091977, + "learning_rate": 1.228762317073695e-11, + "loss": 0.76248705, + "num_input_tokens_seen": 179311225, + "step": 8307, + "time_per_iteration": 2.500269889831543 + }, + { + "auxiliary_loss_clip": 0.01138507, + "auxiliary_loss_mlp": 0.01023328, + "balance_loss_clip": 1.04582012, + "balance_loss_mlp": 1.01616979, + "epoch": 0.9989779354295677, + "flos": 31285627637760.0, + "grad_norm": 2.6241618388627823, + "language_loss": 0.79009187, + "learning_rate": 9.70874138195299e-12, + "loss": 0.81171024, + "num_input_tokens_seen": 179333135, + "step": 8308, + "time_per_iteration": 3.370873212814331 + }, + { + "auxiliary_loss_clip": 0.01165362, + "auxiliary_loss_mlp": 0.01026353, + "balance_loss_clip": 1.04553401, + "balance_loss_mlp": 1.01942658, + "epoch": 0.9990981783202069, + "flos": 19573398823680.0, + "grad_norm": 2.2544950448494077, + "language_loss": 0.74613214, + "learning_rate": 7.433256530076093e-12, + "loss": 0.76804924, + "num_input_tokens_seen": 179353090, + "step": 8309, + "time_per_iteration": 2.4445178508758545 + }, + { + "auxiliary_loss_clip": 0.0111342, + "auxiliary_loss_mlp": 0.01021633, + "balance_loss_clip": 1.04081964, + "balance_loss_mlp": 1.01496065, + "epoch": 0.9992184212108459, + "flos": 17199667514880.0, + "grad_norm": 2.1625161308300807, + "language_loss": 0.75373125, + "learning_rate": 5.46116896038562e-12, + "loss": 0.77508187, + "num_input_tokens_seen": 179367500, + "step": 8310, + "time_per_iteration": 3.2368834018707275 + }, + { + "auxiliary_loss_clip": 0.0113511, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.04501009, + "balance_loss_mlp": 1.01980472, + "epoch": 0.999338664101485, + "flos": 46497853681920.0, + "grad_norm": 1.9497580178527667, + "language_loss": 0.61725223, + "learning_rate": 3.792478972197699e-12, + "loss": 0.63887244, + "num_input_tokens_seen": 179388085, + "step": 8311, + "time_per_iteration": 2.6791017055511475 + }, + { + "auxiliary_loss_clip": 0.01162768, + "auxiliary_loss_mlp": 0.01019195, + "balance_loss_clip": 1.04503465, + "balance_loss_mlp": 1.01223326, + "epoch": 0.9994589069921241, + "flos": 15158253859200.0, + "grad_norm": 3.0613174986916745, + "language_loss": 0.70076668, + "learning_rate": 2.4271868181990895e-12, + "loss": 0.72258627, + "num_input_tokens_seen": 179405250, + "step": 8312, + "time_per_iteration": 2.390583038330078 + }, + { + "auxiliary_loss_clip": 0.01152469, + "auxiliary_loss_mlp": 0.01023196, + "balance_loss_clip": 1.04528689, + "balance_loss_mlp": 1.01573968, + "epoch": 0.9995791498827632, + "flos": 12531460256640.0, + "grad_norm": 2.1285603506656257, + "language_loss": 0.81137228, + "learning_rate": 1.3652927060014973e-12, + "loss": 0.83312893, + "num_input_tokens_seen": 179420845, + "step": 8313, + "time_per_iteration": 2.4048800468444824 + }, + { + "auxiliary_loss_clip": 0.01129273, + "auxiliary_loss_mlp": 0.01027272, + "balance_loss_clip": 1.04499984, + "balance_loss_mlp": 1.01942825, + "epoch": 0.9996993927734023, + "flos": 19245175320960.0, + "grad_norm": 2.1174869671760064, + "language_loss": 0.6384474, + "learning_rate": 6.067967965872612e-13, + "loss": 0.6600129, + "num_input_tokens_seen": 179440455, + "step": 8314, + "time_per_iteration": 2.47021222114563 + }, + { + "auxiliary_loss_clip": 0.01121787, + "auxiliary_loss_mlp": 0.01026695, + "balance_loss_clip": 1.04453707, + "balance_loss_mlp": 1.01979291, + "epoch": 0.9998196356640414, + "flos": 62952804518400.0, + "grad_norm": 3.3841141806213604, + "language_loss": 0.77099526, + "learning_rate": 1.5169920497548615e-13, + "loss": 0.79248011, + "num_input_tokens_seen": 179465075, + "step": 8315, + "time_per_iteration": 3.634526014328003 + }, + { + "auxiliary_loss_clip": 0.01104206, + "auxiliary_loss_mlp": 0.01015116, + "balance_loss_clip": 1.02576852, + "balance_loss_mlp": 1.01107836, + "epoch": 0.9999398785546805, + "flos": 50922375073920.0, + "grad_norm": 1.0914396496542906, + "language_loss": 0.54964697, + "learning_rate": 0.0, + "loss": 0.57084024, + "num_input_tokens_seen": 179513955, + "step": 8316, + "time_per_iteration": 3.0176162719726562 + }, + { + "epoch": 0.9999398785546805, + "num_input_tokens_seen": 179513955, + "step": 8316, + "total_flos": 6.996749092776837e+17, + "train_loss": 0.7892607923968014, + "train_runtime": 23405.2918, + "train_samples_per_second": 14.213, + "train_steps_per_second": 0.355 + } + ], + "logging_steps": 1.0, + "max_steps": 8316, + "num_input_tokens_seen": 179513955, + "num_train_epochs": 1, + "save_steps": 1664, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.996749092776837e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}